# Installing the libraries with the specified version.
#!pip install numpy==1.25.2 pandas==1.5.3 matplotlib==3.7.1 seaborn==0.13.1 -q --user

!pip install numpy pandas matplotlib seaborn -q --user

"""
Ran into an error as shown below; hence, removed the versions and executed the command per instruction provided by the evaluator in the FoodHub Forum.

  AttributeError: module 'pkgutil' has no attribute 'ImpImporter'. Did you mean: 'zipimporter'?
  [end of output]
"""

"\nRan into an error as shown below; hence, removed the versions and executed the command per instruction provided by the evaluator in the FoodHub Forum.\n\n  AttributeError: module 'pkgutil' has no attribute 'ImpImporter'. Did you mean: 'zipimporter'?\n  [end of output]\n"

# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# I used Anaconda/Jupyter Notebook and didn't use Google Colab for this project completion. So, the below statements are commented.
# from google.colab import drive
# drive.mount('/content/drive')

# Read the dataset from the csv file
foodHub_df = pd.read_csv('foodhub_order.csv')

# View the first 5 rows from the DataFrame to ensure the csv file was loaded properly into the DataFrame.
foodHub_df.head()

# Check for whether duplicate records exist in the dataset.
foodHub_df.duplicated().sum()

0

# Write your code here
foodHub_df.shape

(1898, 9)

# Write your code here
foodHub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

# Write your code here
foodHub_df.isnull().sum()

order_id                 0
customer_id              0
restaurant_name          0
cuisine_type             0
cost_of_the_order        0
day_of_the_week          0
rating                   0
food_preparation_time    0
delivery_time            0
dtype: int64

# Write your code here
foodHub_df.describe().T

# To get the count of orders not rated.
foodHub_df['rating'].value_counts()['Not given']

736

# To get a list of orders that weren't rated.
not_rated = foodHub_df[foodHub_df['rating'] == 'Not given'].reset_index(drop=True)
not_rated

# To visualize distribution of orders among the different cuisine types.
plt.figure(figsize=(20,7))
sns.countplot(data=foodHub_df, x='cuisine_type')
plt.title('Cuisine Type Vs Number of Orders')
plt.xlabel('Cuisine Type')
plt.ylabel('No. of orders');
#plt.xticks(rotation=90)

plt.figure(figsize=(20,7))
sns.countplot(data=foodHub_df, x='cuisine_type', hue='day_of_the_week')
plt.title('Cuisine Type Vs Day of The Week')
plt.xlabel('Cuisine Type')
plt.ylabel('Day of The Week');
#plt.xticks(rotation=90)

#plt.figure(figsize=(20,7))
sns.boxplot(data=foodHub_df, x='food_preparation_time')
plt.xlabel('Time taken to prepare food in minutes');
#plt.ylabel('No. of orders');
#plt.xticks(rotation=90)

# Generate subplots to determine correlation between cuisine type and food preparation time.
g = sns.FacetGrid(foodHub_df, col="cuisine_type")
g.map(sns.histplot, "food_preparation_time")
plt.xlabel('Time taken to prepare food');
#plt.show()

# Visualize delivery time using box plot to determine minimum, 25th, 50th, 75th percentile and maximum time.
sns.boxplot(data=foodHub_df, x='delivery_time')
plt.title('Time to deliver food after preparation');
plt.xlabel('Delivery Time in minutes');

# Visualize correlation between cuisine type and customer rating to detemine the highly rated cuisine types.
plt.figure(figsize=(20,7))
sns.countplot(data=foodHub_df, x='cuisine_type', hue='rating')
plt.title('Cuisine Type vs Number of Orders')
plt.xlabel('Cuisine Type')
plt.ylabel('No. of orders');
#plt.xticks(rotation=90)

# To get a number of orders per restaurant. The default sort value is descending; so, no need to specify sort order explicitly.
orders_count = foodHub_df['restaurant_name'].value_counts()

#To get the top 5 restaurants in terms of number of orders received
orders_count.head()

restaurant_name
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: count, dtype: int64

# First extract all rows of orders placed over the weekends
foodHub_df_weekend = foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekend'])]
foodHub_df_weekend

# Second group the rows extracted in step 1 by cuisine type and sort the results in descending order of counts and fetch the first row.
foodHub_df_weekend.groupby(['cuisine_type'])['order_id'].count().sort_values(ascending=False).head(1)

cuisine_type
American    415
Name: order_id, dtype: int64

""" 
Alternatively, the below code would fetch the output using one line statement. However, the one line statement could 
potentially add complexity in terms of reading, comprehending and maintaning the code as it depends on level of proficiency in Python.
So, a step-by-step process is recommeded to promote readability and maintanability.
"""
foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekend'])].groupby(['cuisine_type'])['order_id'].count().sort_values(ascending=False).head(1)

cuisine_type
American    415
Name: order_id, dtype: int64

# get total number of order with order value of more than $20.00
foodHub_df_Over_20 = foodHub_df[foodHub_df['cost_of_the_order'] > 20.00].shape[0]

# Calculate % of orders cost more than 20 dollars by dividing count of orders with over 20 dollars by total number of orders.
foodHub_df_Over_pct = (foodHub_df_Over_20/foodHub_df.shape[0])* 100

# Limit the decimal places to 2 decimals.
foodHub_df_Over_pct = f"{foodHub_df_Over_pct:.2f}"

# Print the result
print('The percentage of orders cost more than 20 dollars is ',foodHub_df_Over_pct,'%.',sep='')

The percentage of orders cost more than 20 dollars is 29.24%.

# Generate the statistical summary
foodHub_df.describe().T

# Get top 3 most frequent customers
foodHub_df['customer_id'].value_counts().head(3)

customer_id
52832    13
47440    10
83287     9
Name: count, dtype: int64

# Heatmap to visualize (postive or negative) correlation among cost of the order, food prepration time and delivery time.
sns.heatmap(foodHub_df[['cost_of_the_order','food_preparation_time','delivery_time']].corr(),annot=True,vmin=-1,vmax=1);

# Determine relationship between "Food Preparation Time and "Delivery Time" using line plot
plt.figure(figsize=(15,7))
sns.lineplot(data=foodHub_df, x='food_preparation_time', y ='delivery_time',errorbar=None)
plt.title('Food prepration Time Vs Delivery Time')
plt.xlabel('Food Preparation Time in minutes')
plt.ylabel('Delivery Time in minutes')
plt.show()

# Determine relationship between "Delivery Time" and "Rating" using scatter plot
plt.figure(figsize=(15,7))
plt.title('Delivery Time Vs Customer Rating')
plt.xlabel('Delivery Time in minutes')
plt.ylabel('Customer Rating (0 - 5)')
sns.scatterplot(data=foodHub_df, x='delivery_time', y ='rating')
plt.show()

# Determine relationship between "Delivery Time" and "Rating"
plt.figure(figsize=(15,7))
sns.scatterplot(data=foodHub_df, x='food_preparation_time', y ='cost_of_the_order')
plt.title('Food Preparation Time Vs Cost of The Order')
plt.xlabel('Food Preparation Time in minutes')
plt.ylabel('Cost of The Order')
plt.show()

# Determine relationship/correlation between "Food Preparation Time" and "Cost of The Order". Using hex to visualize better.
plt.figure(figsize=(15,7))
sns.jointplot(data=foodHub_df, x='food_preparation_time', y ='cost_of_the_order', kind='hex')
plt.title('Food Preparation Time Vs Cost of The Order')
plt.xlabel('Food Preparation Time in minutes')
plt.ylabel('Cost of The Order in USD')
plt.show()

<Figure size 1500x700 with 0 Axes>

# Boxplot to visalize correlation between Delivery Time and Day of The Week
plt.title('Delivery Time Vs Day of The Week')
plt.xlabel('Delivery Time in minutes')
plt.ylabel('Day of The Week')
sns.boxplot(data=foodHub_df, x='delivery_time', y ='day_of_the_week');

# Determine relationship between "Delivery Time" and "Rating"
plt.figure(figsize=(15,7))
sns.scatterplot(data=foodHub_df, x='delivery_time', y ='rating', hue='cuisine_type')
plt.title('Delivery Time Vs Customer Rating')
plt.xlabel('Delivery Time in minutes')
plt.ylabel('Customer Rating (0 - 5)')
plt.show()

"""
The below code contains multiple steps to increase readability and also to ensure 
that anyone looking at the code can easily understand from maintenance perspective.
"""
# Applied rating > 4 condition to filter out rows with rating = 5 only. This would improve performance as well.
foodHub_df_rating_over4 = foodHub_df[~foodHub_df['rating'].isin(['Not given', '4', '3', '2', '1', '0'])]
#foodHub_df_rating_over4

# Then grouped the records by "restaurant name and rating" to determine count for each combination.
foodHub_df_rating_count = foodHub_df_rating_over4.groupby(['restaurant_name','rating'])['rating'].count().sort_values(ascending=False)
#foodHub_df_rating_count

# Then applied the filter to extract only the restaurants that have customer rating count over 50.
foodHub_df_rating_over4_over50 = foodHub_df_rating_count[foodHub_df_rating_count > 50]
foodHub_df_rating_over4_over50

restaurant_name    rating
Shake Shack        5         60
The Meatball Shop  5         53
Name: rating, dtype: int64

# Create a DataFrame containing just the "cost_of_the_order"column from foodHub_df.
cost_of_order = foodHub_df['cost_of_the_order']

# Declare and initialize variables to store sum values.
revenue_over_20 = 0
revenue_over_5 = 0

# Loop through to calculate cumulative values of the sums
for i in cost_of_order:
    if i > 20.00:
        revenue_over_20 = revenue_over_20 + (i * 0.25)
    elif i > 5:
        revenue_over_5 = revenue_over_5 + (i * 0.15)

# Calculate the net_revenue
net_revenue = revenue_over_20 + revenue_over_5

# Print the calculated values.
print('Revenue generated from orders over dollar 20 is', revenue_over_20) 
print('Revenue generated from orders over dollar 5 but less than or equal to 20 is', revenue_over_5)
print('NET revenue generated in dollars',net_revenue)

Revenue generated from orders over dollar 20 is 3688.7275000000027
Revenue generated from orders over dollar 5 but less than or equal to 20 is 2477.5755000000036
NET revenue generated in dollars 6166.303000000006

# under the existing foodHub_df, create a new column named "total_time_required_to_deliver" by adding both food_preparation_time and delivery_time
foodHub_df['total_time_required_to_deliver'] = foodHub_df['food_preparation_time'] + foodHub_df['delivery_time']

# Create a new dataframe containing just the orders that took over 60 minutes to deliver the food.
order_with_delivery_time_over60 = foodHub_df[foodHub_df['total_time_required_to_deliver'] > 60]

# Calculate the percent of overall orders that took more than 60 minutes to deliver the food.
pct_of_order_over60_minutes_delivery_time = (order_with_delivery_time_over60.shape[0]/foodHub_df.shape[0])*100

# print the percentage.
print('The percent of total order that took over 60 minutes for delivery is', pct_of_order_over60_minutes_delivery_time)

The percent of total order that took over 60 minutes for delivery is 10.537407797681771

# Generate statistics for orders received/processed during "Weekend"
foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekend'])].describe().T

# Generate statistics for orders received/processed during "Weekend"
foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekday'])].describe().T

# Determine number of orders received during "Weekday"
foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekday'])].shape[0] # 547 orders

547

# Determine number of orders received during "Weekend"
foodHub_df[foodHub_df['day_of_the_week'].isin(['Weekend'])].shape[0] # 1351 orders

1351

	count	mean	std	min	25%	50%	75%	max
order_id	1898.0	1.477496e+06	548.049724	1476547.00	1477021.25	1477495.50	1.477970e+06	1478444.00
customer_id	1898.0	1.711685e+05	113698.139743	1311.00	77787.75	128600.00	2.705250e+05	405334.00
cost_of_the_order	1898.0	1.649885e+01	7.483812	4.47	12.08	14.14	2.229750e+01	35.41
food_preparation_time	1898.0	2.737197e+01	4.632481	20.00	23.00	27.00	3.100000e+01	35.00
delivery_time	1898.0	2.416175e+01	4.972637	15.00	20.00	25.00	2.800000e+01	33.00

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477894	157711	The Meatball Shop	Italian	6.07	Weekend	Not given	28	21
3	1477895	143926	Big Wong Restaurant _¤¾Ñ¼	Chinese	5.92	Weekday	Not given	34	28
4	1478198	62667	Lucky's Famous Burgers	American	12.13	Weekday	Not given	23	30
...	...	...	...	...	...	...	...	...	...
731	1476873	237616	Shake Shack	American	5.82	Weekend	Not given	26	30
732	1476981	138586	Shake Shack	American	5.82	Weekend	Not given	22	28
733	1477473	97838	Han Dynasty	Chinese	29.15	Weekend	Not given	29	21
734	1477819	35309	Blue Ribbon Sushi	Japanese	25.22	Weekday	Not given	31	24
735	1478056	120353	Blue Ribbon Sushi	Japanese	19.45	Weekend	Not given	28	24

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time	total_time_required_to_deliver
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20	45
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23	48
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15	40
6	1477894	157711	The Meatball Shop	Italian	6.07	Weekend	Not given	28	21	49
11	1478437	221206	Empanada Mama (closed)	Mexican	8.10	Weekend	5	23	22	45
...	...	...	...	...	...	...	...	...	...	...
1891	1476981	138586	Shake Shack	American	5.82	Weekend	Not given	22	28	50
1892	1477473	97838	Han Dynasty	Chinese	29.15	Weekend	Not given	29	21	50
1893	1476701	292602	Chipotle Mexican Grill $1.99 Delivery	Mexican	22.31	Weekend	5	31	17	48
1894	1477421	397537	The Smile	American	12.18	Weekend	5	31	19	50
1897	1478056	120353	Blue Ribbon Sushi	Japanese	19.45	Weekend	Not given	28	24	52

	count	mean	std	min	25%	50%	75%	max
order_id	1898.0	1.477496e+06	548.049724	1476547.00	1477021.25	1477495.50	1.477970e+06	1478444.00
customer_id	1898.0	1.711685e+05	113698.139743	1311.00	77787.75	128600.00	2.705250e+05	405334.00
cost_of_the_order	1898.0	1.649885e+01	7.483812	4.47	12.08	14.14	2.229750e+01	35.41
food_preparation_time	1898.0	2.737197e+01	4.632481	20.00	23.00	27.00	3.100000e+01	35.00
delivery_time	1898.0	2.416175e+01	4.972637	15.00	20.00	25.00	2.800000e+01	33.00

	count	mean	std	min	25%	50%	75%	max
order_id	1351.0	1.477500e+06	554.168089	1476547.00	1477015.50	1477505.00	1477980.500	1478444.00
customer_id	1351.0	1.704226e+05	113403.749068	1311.00	77665.00	127934.00	268958.000	405334.00
cost_of_the_order	1351.0	1.657483e+01	7.486979	4.47	12.08	14.41	22.285	33.37
food_preparation_time	1351.0	2.743671e+01	4.601476	20.00	23.00	27.00	31.000	35.00
delivery_time	1351.0	2.247002e+01	4.628938	15.00	18.50	22.00	27.000	30.00
total_time_required_to_deliver	1351.0	4.990674e+01	6.624883	35.00	45.00	50.00	55.000	65.00

	count	mean	std	min	25%	50%	75%	max
order_id	547.0	1.477483e+06	532.945358	1476559.00	1477043.00	1477457.00	1477929.000	1478435.00
customer_id	547.0	1.730108e+05	114505.333032	5693.00	78913.00	130521.00	276192.000	404649.00
cost_of_the_order	547.0	1.631119e+01	7.479521	4.75	12.08	14.07	22.285	35.41
food_preparation_time	547.0	2.721207e+01	4.708616	20.00	23.00	27.00	31.000	35.00
delivery_time	547.0	2.834004e+01	2.891428	24.00	26.00	28.00	31.000	33.00
total_time_required_to_deliver	547.0	5.555210e+01	5.567972	44.00	51.00	55.00	59.500	68.00

Project Python Foundations: FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Observation¶

Observation¶

Question 1: How many rows and columns are present in the data? [0.5 mark]¶

Observations:¶

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]¶

Observations:¶

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]¶

Observation:¶

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶

Observations:¶

Question 5: How many orders are not rated? [1 mark]¶

Observation:¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]¶

Observations¶

Observations¶

Observations¶

Observations¶

Observations¶

Observations¶

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]¶

Observations:¶

Question 8: Which is the most popular cuisine on weekends? [1 mark]¶

Observation¶

Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]¶

Observation¶

Question 10: What is the mean order delivery time? [1 mark]¶

Observation¶

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]¶

Observation¶

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]¶

Observation¶

Observation¶

Observation¶

Observations¶

Observations¶

Observations¶

Observation¶

Observation¶

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]¶

Observations¶

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]¶

Observations¶

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]¶

Observations¶

Conclusion and Recommendations¶

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]¶

Conclusions:¶

Recommendations:¶