Coffee Shop Sales Analytics - EDA¶

Nitin Reddy Manda¶

Introduction¶

Coffee Bliss is a popular, independently owned coffee shop, known for its high-quality coffee and cozy atmosphere. As the business grows, the owners recognize the need for data-driven decision-making to optimize sales, improve inventory management, and enhance the customer experience. By analyzing sales data, Coffee Bliss aims to uncover actionable insights that will drive growth and increase profitability.

Key stakeholders¶

The main stakeholders in this project are:

  • Shop Owners: Looking to optimize sales strategies and increase profitability.
  • Marketing Team: Interested in understanding customer behavior and tailoring marketing efforts accordingly.
  • Operations Team: Aiming to improve inventory management and streamline operations.

Objectives¶

  • Analyze Sales Trends: Examine sales data to identify significant trends, patterns, and anomalies in transaction volumes, product popularity, and seasonal variations.
  • Understand Customer Behavior: Explore customer purchasing habits and preferences to better align offerings with customer needs.
  • Set KPIs: Define and monitor essential metrics like daily sales and average transaction value to track business performance.
  • Provide Actionable Insights: Offer insights and recommendations based on the EDA to help improve sales strategies and operational efficiency.

Dataset¶

The dataset is publicly available on Kaggle.

Data Cleaning and Preparation¶

Importing necessary libraries and Dataset¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Importing data from the Excel file
df = pd.read_excel(r'Coffee Shop Sales.xlsx', sheet_name = 'Transactions')
df.head()
Out[2]:
transaction_id transaction_date transaction_time transaction_qty store_id store_location product_id unit_price product_category product_type product_detail Revenue Month Month.1 Weekday Weekday.1 Hour
0 1 2023-01-01 07:06:11 2 5 Lower Manhattan 32 3.0 Coffee Gourmet brewed coffee Ethiopia Rg 6.0 1 Jan 7 Sun 7
1 2 2023-01-01 07:08:56 2 5 Lower Manhattan 57 3.1 Tea Brewed Chai tea Spicy Eye Opener Chai Lg 6.2 1 Jan 7 Sun 7
2 3 2023-01-01 07:14:04 2 5 Lower Manhattan 59 4.5 Drinking Chocolate Hot chocolate Dark chocolate Lg 9.0 1 Jan 7 Sun 7
3 4 2023-01-01 07:20:24 1 5 Lower Manhattan 22 2.0 Coffee Drip coffee Our Old Time Diner Blend Sm 2.0 1 Jan 7 Sun 7
4 5 2023-01-01 07:22:41 2 5 Lower Manhattan 57 3.1 Tea Brewed Chai tea Spicy Eye Opener Chai Lg 6.2 1 Jan 7 Sun 7

Let's look at the structure of the data and make sense of the columns.

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   transaction_id    149116 non-null  int64         
 1   transaction_date  149116 non-null  datetime64[ns]
 2   transaction_time  149116 non-null  object        
 3   transaction_qty   149116 non-null  int64         
 4   store_id          149116 non-null  int64         
 5   store_location    149116 non-null  object        
 6   product_id        149116 non-null  int64         
 7   unit_price        149116 non-null  float64       
 8   product_category  149116 non-null  object        
 9   product_type      149116 non-null  object        
 10  product_detail    149116 non-null  object        
 11  Revenue           149116 non-null  float64       
 12  Month             149116 non-null  int64         
 13  Month.1           149116 non-null  object        
 14  Weekday           149116 non-null  int64         
 15  Weekday.1         149116 non-null  object        
 16  Hour              149116 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(7), object(7)
memory usage: 19.3+ MB
  • The dataset appears well-structured with no missing values, which is excellent for analysis.

Actions:

  • We will remove the transaction_time, Month, and Weekday columns, since Hour, Month.1, and Weekday.1 provide more descriptive representations of the time of day, month, and day of the week. We can also remove product_id and store_id, as product_detail and store_location are better representations.
  • Rename Month.1 and Weekday.1 for better readability.
In [ ]:
df.drop(columns=['transaction_time','store_id','Month','Weekday','product_id'], inplace = True )
df.rename(columns={'Month.1':'Month','Weekday.1':'Weekday'}, inplace = True)
  • To gain an overview of the dataset’s numerical features and understand their statistical properties, we'll use df.describe(). This will provide insights into measures such as the mean, standard deviation, and range for each numerical column.
In [7]:
df.describe()
Out[7]:
transaction_id transaction_date transaction_qty unit_price Revenue Hour
count 149116.000000 149116 149116.000000 149116.000000 149116.000000 149116.000000
mean 74737.371872 2023-04-15 11:50:32.173609984 1.438276 3.382219 4.686367 11.735790
min 1.000000 2023-01-01 00:00:00 1.000000 0.800000 0.800000 6.000000
25% 37335.750000 2023-03-06 00:00:00 1.000000 2.500000 3.000000 9.000000
50% 74727.500000 2023-04-24 00:00:00 1.000000 3.000000 3.750000 11.000000
75% 112094.250000 2023-05-30 00:00:00 2.000000 3.750000 6.000000 15.000000
max 149456.000000 2023-06-30 00:00:00 8.000000 45.000000 360.000000 20.000000
std 43153.600016 NaN 0.542509 2.658723 4.227099 3.764662
  • From the transaction_date column we can see that all records fall within 2023 and cover January through June only (a quick check below confirms this).
  • Transaction Quantity: Varies from 1 to 8 units, with an average of 1.44 units.
  • Unit Price(in dollars): Ranges from 0.80 to 45.00, averaging 3.38, indicating price variability.
  • Revenue(in dollars): From 0.80 to 360.00, with an average of 4.69, reflecting diverse transaction values.
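As a quick sanity check (an addition to the original notebook), we can confirm the date coverage directly:

In [ ]:
# Added sketch: confirm the data spans January-June 2023 only
print(df['transaction_date'].min(), '-', df['transaction_date'].max())
print(sorted(df['transaction_date'].dt.month.unique()))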

Checking for duplicate values¶

In [9]:
df[df.duplicated()]
Out[9]:
transaction_id transaction_date transaction_qty store_location unit_price product_category product_type product_detail Revenue Month Weekday Hour
  • The empty DataFrame indicates that there are no duplicate rows.

Checking for outliers¶

In [10]:
fig, ax = plt.subplots(1, 3, figsize=(11, 4))
sns.boxplot(data=df[['transaction_qty']], ax=ax[0])
sns.boxplot(data=df[['unit_price']], ax=ax[1])
sns.boxplot(data=df[['Revenue']], ax=ax[2])
plt.show()
[Figure: boxplots of transaction_qty, unit_price, and Revenue]
  • Outliers are present in the transaction quantity, unit price, and revenue columns. Domain knowledge is crucial for addressing these outliers effectively.
  • However, these outliers are justifiable: high transaction quantities naturally produce higher revenue, and premium items carry higher unit prices. We will therefore leave them as they are, since they reflect realistic variation in the data; the sketch below simply counts them for reference.
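For reference, here is a minimal IQR-based sketch (an addition, not part of the original analysis) that counts how many points fall outside the usual 1.5 × IQR whiskers shown in the boxplots:

In [ ]:
# Added sketch: count points beyond the 1.5*IQR whiskers for each numeric column
for col in ['transaction_qty', 'unit_price', 'Revenue']:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    print(f'{col}: {mask.sum()} outliers ({mask.mean():.1%} of rows)')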

Exploratory Data Analysis¶

Univariate Analysis¶

Let's look at revenue of each store over the year 2023¶
In [38]:
# grouping store locations with the total revenue earned by each store
df_revenue = df.groupby('store_location')['Revenue'].sum().reset_index()
# Dividing the Revenue by 1000 for easy representation in k dollars
df_revenue['Revenue'] = round(df_revenue['Revenue']/1000,2)

plt.figure(figsize=(5,3))
sns.barplot(x = 'store_location', y= 'Revenue', data = df_revenue)
plt.ylim(0,250)
plt.title('Total Revenue based on store location (k$)')
plt.xlabel('Store Location')
plt.ylabel('Revenue (k$)')
plt.show()
[Figure: total revenue based on store location (k$)]
  • Total revenue is quite similar across all three stores, indicating little variation in revenue performance between locations; the sketch below quantifies each store's share.
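To quantify how close the stores are, a small added sketch (reusing df) that computes each store's percentage share of total revenue:

In [ ]:
# Added sketch: each store's percentage share of total revenue
store_shares = df.groupby('store_location')['Revenue'].sum()
print((store_shares / store_shares.sum() * 100).round(1))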

Let's check how many units are commonly bought per transaction¶

In [158]:
units_per_transaction = df.groupby('transaction_qty').size()
plt.figure(figsize=(5,3))
sns.barplot(x= units_per_transaction.index, y= units_per_transaction.values, color ='dodgerblue')
plt.xlabel('Number of Units')
plt.ylabel('Transactions')
Out[158]:
Text(0, 0.5, 'Transactions')
[Figure: number of transactions by units purchased]
  • Most transactions involve only one or two items.
  • We have an opportunity to increase the number of items purchased per transaction through strategies such as bundle discounts, volume-based pricing, or loyalty rewards for larger purchases; the sketch below puts a number on how dominant 1–2 unit purchases are.
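A one-line sketch (added, not in the original) to measure that share:

In [ ]:
# Added sketch: fraction of transactions that involve one or two units
print(f"{(df['transaction_qty'] <= 2).mean():.1%} of transactions buy 1-2 units")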

Let's look at which category has the highest number of units sold¶

In [94]:
transaction_by_category = df.groupby('product_category')['transaction_qty'].sum().reset_index()
transaction_by_category = transaction_by_category[['product_category', 'transaction_qty']].sort_values(by='transaction_qty', ascending=False)

total = transaction_by_category['transaction_qty'].sum()
def categorize(row,total):
    perct = (row['transaction_qty']/total)*100
    return row['product_category'] if perct > 6 else 'Others'

transaction_by_category['category'] = transaction_by_category.apply(categorize,total = total,axis=1) 
pie_cat = transaction_by_category.groupby('category')['transaction_qty'].sum().sort_values()


fig , ax = plt.subplots(1,2, figsize = (12,5))
#plt.figure(figsize = (7,3))
sns.barplot(y='product_category', x= 'transaction_qty', data=transaction_by_category, orient='h', ax=ax[0], palette='Blues_d')
ax[0].set_title('Number of units sold by each category')
ax[0].set_ylabel('Product category')
ax[0].set_xlabel('Units sold')

colors = ['#ADD8E6', '#87CEEB', '#1E90FF', '#4169E1', '#0202F5']
ax[1].pie(pie_cat.values, labels = pie_cat.index,autopct='%1.1f%%', colors=colors)
ax[1].set_title('Category distribution Pie chart')
fig.subplots_adjust(left = 0.05, right = 0.95)
plt.show()
[Figure: units sold by each category (bar chart) and category distribution (pie chart)]
  • Coffee is the most popular category, with the highest units sold and revenue generated, indicating a strong customer preference.
  • Tea ranks second, showing significant sales and revenue, reflecting solid demand and interest.
  • Bakery items come in third, with notable sales figures, though not as dominant as coffee or tea, suggesting a potential area for growth.

Unit price distribution among categories¶

In [95]:
unique_prices = df.drop_duplicates(subset=['product_category','unit_price'])

fig , ax = plt.subplots(figsize=(8,4))

sns.boxplot(x = 'product_category', y='unit_price', data = unique_prices, flierprops = {'marker':'o','markerfacecolor':'none'})

plt.ylabel('Unit price ($)', fontsize= 12)
plt.xlabel('Product category', fontsize = 12)
plt.xticks(rotation = 45)
plt.title('Unit price distribution for Product categories', fontsize = 13)
Out[95]:
Text(0.5, 1.0, 'Unit price distribution for Product categories')
[Figure: unit price distribution for product categories]
  • The analysis of unit prices reveals that the highest-priced products, such as loose tea, coffee beans, packaged chocolate, and branded items, tend to have lower sales volumes. Higher-priced items are purchased less frequently than lower-priced products, which show more consistent sales (the sketch after this list checks the price/volume relationship directly).

  • Coffee and tea, being lower-priced items, exhibit high sales volumes. To capitalize on this trend, focus on promoting these popular, high-turnover products while exploring opportunities to upsell or cross-sell higher-priced items to boost overall revenue.
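One way to check the price/volume relationship directly is to bin unit prices and sum the units sold in each band. A hedged sketch follows; the bin edges are illustrative choices, not from the original analysis:

In [ ]:
# Added sketch: total units sold per unit-price band (illustrative bin edges)
price_bands = pd.cut(df['unit_price'], bins=[0, 2, 4, 6, 10, 50])
print(df.groupby(price_bands, observed=True)['transaction_qty'].sum())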

Bivariate Analysis¶

Looking at revenue and sales performance over months and days¶

In [140]:
# highest revenue generated by month
month_rev = df.groupby(['Month'])['Revenue'].sum().reset_index()

#sorting by month
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

month_rev['month_order'] = month_rev['Month'].map({Month: i for i,Month in enumerate(month_order)}) 
month_rev = month_rev.sort_values(by = 'month_order').drop('month_order',axis=1)

# daily revenue and quantity
daily_revenue = df.groupby('transaction_date')['Revenue'].sum().reset_index()
daily_qty = df.groupby('transaction_date')['transaction_qty'].sum().reset_index()

fig, (ax,ax0) = plt.subplots(1,2, figsize = (20,8))
sns.barplot(x = 'Month', y ='Revenue', data = month_rev, ax=ax, palette = 'Blues')
ax.set_title('Total Monthly revenue for 2023 ')


ax1 = ax0.twinx()
ax0.plot('transaction_date', 'Revenue', data=daily_revenue , color='red')
ax1.plot('transaction_date', 'transaction_qty', data=daily_qty, color='blue')
ax0.set_ylabel("Revenue", color='red')
ax0.set_xlabel('Transaction Date')
ax1.set_ylabel('Sale Quantity', color='blue')
plt.title('Daily sales and Revenue trend over 6 months')
plt.show()
[Figure: total monthly revenue for 2023 (bar chart) and daily sales and revenue trend over 6 months (line chart)]
  • Monthly Sales Trend: Sales and transaction volumes have been rising since February. More extended data could shed light on annual seasonality and potential sales limits.
  • Daily Sales Trend: Sales vary with weekdays, holidays, and promotions, with noticeable dips at the start and end of each month. The overall trend is positive, indicating business growth (a rolling-mean sketch below makes this easier to see).
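To see the growth trend past the day-to-day noise, an added sketch (reusing the daily_revenue frame built above) that overlays a 7-day rolling mean:

In [ ]:
# Added sketch: smooth daily revenue with a 7-day rolling mean to expose the trend
daily_revenue['rolling_7d'] = daily_revenue['Revenue'].rolling(window=7).mean()
plt.figure(figsize=(8, 3))
plt.plot(daily_revenue['transaction_date'], daily_revenue['Revenue'], alpha=0.3, label='Daily revenue')
plt.plot(daily_revenue['transaction_date'], daily_revenue['rolling_7d'], color='red', label='7-day rolling mean')
plt.legend()
plt.show()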

Looking for Weekday trends in revenue¶

In [159]:
weekday_revenue = df.groupby('Weekday')['Revenue'].sum().reset_index()

# sorting based on the week
weekday_order = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
weekday_revenue['order'] = weekday_revenue['Weekday'].map({Weekday: i for i,Weekday in enumerate(weekday_order) }) 

weekday_revenue = weekday_revenue.sort_values(by='order').drop(columns='order')

plt.figure(figsize=(5,3))
sns.barplot(x=weekday_revenue['Weekday'], y = weekday_revenue['Revenue'], color = 'dodgerblue')
plt.title('Revenue over different days of week')
plt.show()
[Figure: revenue over different days of the week]
  • Revenue varies only slightly across the days of the week; the sketch below averages per calendar day to confirm the totals are not skewed by uneven weekday counts.
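Because the six-month window may contain unequal counts of each weekday, an added sketch (reusing weekday_order from the cell above) that averages revenue per calendar day instead of summing totals:

In [ ]:
# Added sketch: average revenue per calendar day for each weekday
per_day = df.groupby(['transaction_date', 'Weekday'])['Revenue'].sum().reset_index()
print(per_day.groupby('Weekday')['Revenue'].mean().reindex(weekday_order).round(2))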

Let's look at the busiest time of the day¶

In [162]:
# calculating average sales over the day
total_sales_per_hour_of_day = df.groupby('Hour')['Revenue'].sum()
unique_dates = df.drop_duplicates(['transaction_date', 'Hour'])

# Counting how many times specific hours are recorded
hours_of_day_count = unique_dates['Hour'].value_counts(sort=True).sort_index(ascending=True)
avg_hourly_sales_per_day = total_sales_per_hour_of_day / hours_of_day_count

plt.figure(figsize=( 5,4))
plt.plot(avg_hourly_sales_per_day.index, avg_hourly_sales_per_day.values)
plt.title('Average hourly sales in a day')
plt.xlabel('Hour')
plt.ylabel('Average sales')
Out[162]:
Text(0, 0.5, 'Average sales')
[Figure: average hourly sales in a day]
  • Sales peak between 8 and 10 AM as people seek stimulating drinks to start their day. Sales become more evenly distributed from 11 AM to 7 PM.

  • Sales during peak hours are roughly twice as high as during regular hours (checked in the sketch below). This insight can guide employee task allocation and marketing promotions to balance sales throughout the day.
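A quick added sketch (reusing avg_hourly_sales_per_day from the cell above) to back up the roughly-twice-as-high observation:

In [ ]:
# Added sketch: average sales in the 8-10 AM peak vs. all other hours
peak = avg_hourly_sales_per_day.loc[8:10].mean()
off_peak = avg_hourly_sales_per_day.drop([8, 9, 10]).mean()
print(f'Peak / off-peak ratio: {peak / off_peak:.2f}')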

Multivariate Analysis¶

Let's see if there is a preferred product category across the three stores¶

In [166]:
pivoted = df.pivot_table(index = 'product_category', columns='store_location', values='transaction_qty', aggfunc = 'sum')
pivoted['Total'] =  pivoted.sum(axis = 1)
pivoted.sort_values(by='Total', ascending = False, inplace = True)
pivoted.drop(columns = ['Total'], inplace = True)

pivoted.plot(kind='bar', figsize = (10,5))
plt.xticks(rotation = 45)
plt.xlabel('Product category')
plt.ylabel('Total transaction quantity')
plt.title('Distribution of each product category purchased per store')
plt.show()
[Figure: distribution of each product category purchased per store]
  • Across all three stores, most product categories exhibit similar purchase volumes, indicating no clear store-specific favorites; the sketch below normalizes the shares to make this concrete.
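To make the comparison concrete, an added sketch (reusing the pivoted frame) that normalizes each store's column so category shares sum to 100%:

In [ ]:
# Added sketch: each category's share (%) of units sold within each store
category_shares = pivoted.div(pivoted.sum(axis=0), axis=1) * 100
print(category_shares.round(1))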

Final Analysis:¶

  • Sales Consistency: All stores exhibit similar sales and transaction patterns, indicating uniform performance across locations.

  • Transaction Patterns: Customers predominantly purchase 1–2 items per transaction. Coffee and tea make up 70% of total purchases, while the five most expensive items account for less than 6% of sales.

  • Temporal Trends: Sales and transactions show consistent trends over time, with a slight cyclical increase towards the end of each month. Daily sales show no significant variation by weekday, but sales peak between 8 a.m. and 10 a.m., with more moderate activity from 11 a.m. to 7 p.m.

  • Product Preferences: No specific product category preferences are observed across different stores, suggesting a uniform demand for product categories.

Suggestions:¶

  • Encourage Larger Purchases: Since transactions generally involve 1–2 items, introduce group promotions or bundle offers to increase the average transaction size.

  • Optimize Higher-Priced Items: Revise the product lineup or run targeted marketing promotions for higher-priced items to improve their sales performance.

  • Maximize Busy Hours: Leverage the peak hours from 8 a.m. to 10 a.m. by introducing additional promotions. Adjust staffing schedules to balance the workload during these busy times and consider afternoon promotions to boost sales during the quieter hours from 11 a.m. to 7 p.m.

Conclusion¶

The EDA reveals a strong customer preference for coffee and tea, which are the most popular categories, driving the majority of sales and demonstrating consistent demand. Sales trends indicate positive growth, with the potential to further increase revenue by focusing on these key items and implementing promotional strategies during non-peak hours. Addressing the performance of high-priced items through targeted marketing and product adjustments presents additional opportunities for growth. Overall, the data suggests a promising outlook for the coffee shop, with ample room for optimization and increased customer engagement.