lab3 | Notion

#Installing the necessary libraries
install.packages("readr", include_dependencies = TRUE)
library(readr)
install.packages("dplyr", include_dependencies = TRUE)
library(dplyr)
install.packages("tidyverse", include_dependencies = TRUE)
library(tidyverse)

#Working with the datasets library
library(datasets)
data(mtcars)

#Compare between the horsepower of cars with automatic/manual transmission
am.hp <- mean(mtcars$hp[mtcars$am == 0])
mn.hp <- mean(mtcars$hp[mtcars$am == 1])

#Alternatic Method using subset
am.hp <- mean(subset(mtcars, mtcars$am==0)[, 'hp'])
mn.hp <- mean(subset(mtcars, mtcars$am==1)[, 'hp'])

#Store the results in a dataframe
res <- data.frame(transmission= c('manual', 'automatic'),
                  horsepower=c(mn.hp,am.hp))

#Order the results descendingly
res <- res[order(res$average.power, decreasing = TRUE),]
View(res)

#Adding a Column with a constant value
mtcars$Year <- 2002
#Changing the value of a cell
mtcars["Datsun 710","Year"] <-2020
#Accessing a row
rw1 <- mtcars[1, ]
#Accessing an element
print(mtcars[1,1])

#Loading a Csv file
#Dataset link: <https://developer.ibm.com/data/airline/>
df <- read_csv("D:\\\\airline.csv")

#Viewing the dataset
View(df) #Print the dataset
head(df,3) #Show the first n rows
glimpse(df) #Showing the datatype of each column
typeof(df$Year) #Datatype of one column
df$Year[1:20] #Accessing elements
dim(df) # dimensions (rows,columns)

#Slicing the data, accessing columns
a <- df[1,1:10] #one row
df$DayOfWeek # one column

#Subsetting based on condition.
x <-subset(df, Year >2006 & Reporting_Airline =="FL")

#Subset of first 500,000 row.
df <-subset(df,select= c(Year,Month,Reporting_Airline,OriginStateName,DestStateName,DepDelayMinutes,ArrDelayMinutes,Cancelled,AirTime,Distance,DestCityName))[1:500000,]

#Basic statistics
round(mean(df$Year))
mean(df$ArrDelay, na.rm=TRUE)
min(df$Year)
max(df$ArrDelay, na.rm=TRUE)
range(df$Year)

#tapply function - apply a statistical function based on a categorical column
Delay.Per.Airline <- tapply(df$ArrDelayMinutes, INDEX = df$Reporting_Airline, FUN = mean, na.rm=TRUE)
Delay.Per.Airline <- data.frame(Airline=names(Delay.Per.Airline), Average.Delay = Delay.Per.Airline)
view(Delay.Per.Airline)

#Pipeline Operator is used to combine operations in a single command

#1- Find the average arrival delay Time for each airline
df %>%
  group_by(Reporting_Airline) %>%
  summarise(avgDelay= mean(ArrDelayMinutes, na.rm=TRUE))%>%
  head(10)

#Alternative to using pipeline
a <- group_by(df, Reporting_Airline)
b <- summarise(a, avgD=mean(ArrDelayMinutes, na.rm=TRUE))
c <- arrange(b, desc(avgD))
c <- c[1:10,]  

#2- Compare average  delay  for flights in November and January
df %>%
  filter(Month ==11 | Month == 1) %>%
  group_by(Month) %>%
  summarise(avgArrivalDelay= mean(ArrDelayMinutes, na.rm=TRUE),avgDepartureDelay = mean(DepDelayMinutes, na.rm=TRUE) ) 

#3- Which destination city receives the most flights in December
df %>%
  filter(Month==12) %>%
  count(DestCityName, sort = TRUE) %>%
  head(1)

#Alternative Method
df %>%
  filter(Month==12) %>%
  group_by(Month, DestCityName) %>%
  select(DestCityName)%>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(1)
  

#4- Find the speed of each flight?
df %>%
  filter(AirTime>0) %>%
  mutate(speed=Distance/(AirTime))  %>%
  select(DestCityName, Distance, AirTime, speed)%>%
  arrange(desc(speed))