% distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r"> % distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r"> % distinct() %>% count() df<- df %>% distinct() #Turn last reviews to date format and remove missing values df$last.review[df$last.review == ''] <- NA df$last.review <- as.Date(df$last.review,"%m/%d/%Y") df$last.review[is.na(df$last.review)]<- min(df$last.review) #Cleaning the price column #Remove $ sign from the price column df$price <- as.numeric(gsub("\\$","", df$price)) df$service.fee <- as.numeric(gsub("\\$","", df$service.fee)) Finding all missing values colSums(is.na(df)) #Price missing values sum(is.na(df$price)) df$price[is.na(df$price)] <- round(mean(df$price, na.r">
#Dataset Link: <https://github.com/adishourya/Airbnb/blob/master/new-york-city-airbnb-open-data/AB_NYC_2019.csv>
df <- read.csv("Airbnb_Open_Data.csv")
head(df,3)
View(df)

#Drop unwanted columns
df <- subset(df, select = -c(id,host.id,host.name,lat,long, calculated.host.listings.count,house_rules,license,review.rate.number))

#Find column datatypes
lapply(df, typeof)

#Remove duplicate rows
library(dplyr)
df %>% distinct() %>% count()
df<- df %>% distinct()

#Turn last reviews to date format and remove missing values
df$last.review[df$last.review == ''] <- NA
df$last.review  <- as.Date(df$last.review,"%m/%d/%Y")
df$last.review[is.na(df$last.review)]<- min(df$last.review)

#Cleaning the price column
#Remove $ sign from the price column 
df$price <- as.numeric(gsub("\\\\$","", df$price))
df$service.fee <- as.numeric(gsub("\\\\$","", df$service.fee))

Finding all missing values
colSums(is.na(df))

#Price missing values
sum(is.na(df$price))
df$price[is.na(df$price)] <- round(mean(df$price, na.rm = TRUE))
#Alternative - replace missing values with 0
df$price <- ifelse(is.na(df$price), 0, df$price)

#Service Fee missing values
perc<- df$service.fee / df$price
value<-mean(perc, na.rm=TRUE)
df$service.fee <- ifelse(is.na(df$service.fee),round(df$price*value), df$service.fee)

#Handling all missing values using pipeline
library(tidyverse)
df <- df %>%
  replace_na(list(Construction.year =round(mean(df$Construction.year,na.rm=TRUE)),
                  minimum.nights = 1,
                  number.of.reviews = 0,
                  reviews.per.month = 0,
                  availability.365=365))

#Instant Bookable missing values - Boolean Variable
table(df$instant_bookable) #Majority of rows are false
df$instant_bookable[is.na(df$instant_bookable)] <- FALSE

#Convert names to lower case
df$NAME <-lapply(df$NAME,tolower)

#Select all numerical columns
numeric_columns <- sapply(df, is.numeric)
df_numeric <- df[,numeric_columns]
df_numeric['last.review'] <-df$last.review
View(df_numeric)

#Calculate mean price per neighbourhood
mean_price <- tapply(df$price,INDEX=df$neighbourhood.group,FUN=mean, na.rm=TRUE)

#Analyze the price based on the neighbourhood group
res <- df %>%
  group_by(neighbourhood.group) %>%
  summarise(min_price = min(price), max_price = max(price), avg_price = mean(price)) %>%
  arrange(desc(avg_price))

#Show result as a dataframe
res<- data.frame(Area=res['neighbourhood.group'],
                 minimum=res['min_price'],maximum=res['max_price'],
                 average=res['avg_price'])
View(res)

#Find the neighbourhoods that have the most 'Entire home/apt' type of rooms
df %>%
  filter(room.type == 'Entire home/apt')%>%
  group_by(neighbourhood.group) %>%
  count(room.type)%>%
  arrange(desc(n))