#Installing the necessary libraries
install.packages("readr", include_dependencies = TRUE)
library(readr)
install.packages("dplyr", include_dependencies = TRUE)
library(dplyr)
install.packages("tidyverse", include_dependencies = TRUE)
library(tidyverse)
#Working with the datasets library
library(datasets)
data(mtcars)
#Compare between the horsepower of cars with automatic/manual transmission
am.hp <- mean(mtcars$hp[mtcars$am == 0])
mn.hp <- mean(mtcars$hp[mtcars$am == 1])
#Alternatic Method using subset
am.hp <- mean(subset(mtcars, mtcars$am==0)[, 'hp'])
mn.hp <- mean(subset(mtcars, mtcars$am==1)[, 'hp'])
#Store the results in a dataframe
res <- data.frame(transmission= c('manual', 'automatic'),
horsepower=c(mn.hp,am.hp))
#Order the results descendingly
res <- res[order(res$average.power, decreasing = TRUE),]
View(res)
#Adding a Column with a constant value
mtcars$Year <- 2002
#Changing the value of a cell
mtcars["Datsun 710","Year"] <-2020
#Accessing a row
rw1 <- mtcars[1, ]
#Accessing an element
print(mtcars[1,1])
#Loading a Csv file
#Dataset link: <https://developer.ibm.com/data/airline/>
df <- read_csv("D:\\\\airline.csv")
#Viewing the dataset
View(df) #Print the dataset
head(df,3) #Show the first n rows
glimpse(df) #Showing the datatype of each column
typeof(df$Year) #Datatype of one column
df$Year[1:20] #Accessing elements
dim(df) # dimensions (rows,columns)
#Slicing the data, accessing columns
a <- df[1,1:10] #one row
df$DayOfWeek # one column
#Subsetting based on condition.
x <-subset(df, Year >2006 & Reporting_Airline =="FL")
#Subset of first 500,000 row.
df <-subset(df,select= c(Year,Month,Reporting_Airline,OriginStateName,DestStateName,DepDelayMinutes,ArrDelayMinutes,Cancelled,AirTime,Distance,DestCityName))[1:500000,]
#Basic statistics
round(mean(df$Year))
mean(df$ArrDelay, na.rm=TRUE)
min(df$Year)
max(df$ArrDelay, na.rm=TRUE)
range(df$Year)
#tapply function - apply a statistical function based on a categorical column
Delay.Per.Airline <- tapply(df$ArrDelayMinutes, INDEX = df$Reporting_Airline, FUN = mean, na.rm=TRUE)
Delay.Per.Airline <- data.frame(Airline=names(Delay.Per.Airline), Average.Delay = Delay.Per.Airline)
view(Delay.Per.Airline)
#Pipeline Operator is used to combine operations in a single command
#1- Find the average arrival delay Time for each airline
df %>%
group_by(Reporting_Airline) %>%
summarise(avgDelay= mean(ArrDelayMinutes, na.rm=TRUE))%>%
head(10)
#Alternative to using pipeline
a <- group_by(df, Reporting_Airline)
b <- summarise(a, avgD=mean(ArrDelayMinutes, na.rm=TRUE))
c <- arrange(b, desc(avgD))
c <- c[1:10,]
#2- Compare average delay for flights in November and January
df %>%
filter(Month ==11 | Month == 1) %>%
group_by(Month) %>%
summarise(avgArrivalDelay= mean(ArrDelayMinutes, na.rm=TRUE),avgDepartureDelay = mean(DepDelayMinutes, na.rm=TRUE) )
#3- Which destination city receives the most flights in December
df %>%
filter(Month==12) %>%
count(DestCityName, sort = TRUE) %>%
head(1)
#Alternative Method
df %>%
filter(Month==12) %>%
group_by(Month, DestCityName) %>%
select(DestCityName)%>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(1)
#4- Find the speed of each flight?
df %>%
filter(AirTime>0) %>%
mutate(speed=Distance/(AirTime)) %>%
select(DestCityName, Distance, AirTime, speed)%>%
arrange(desc(speed))