# libraries
import glob
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud, STOPWORDS
import warnings
warnings.filterwarnings('ignore')
# read data
## products dataset
path = '../input/learnplatform-covid19-impact-on-digital-learning/'
products_df = pd.read_csv(path + "products_info.csv")
products_df.columns = [x.lower().replace(' ', '_') for x in products_df.columns]

## districts dataset
districts_df = pd.read_csv(path + "districts_info.csv")
#districts_df.state = districts_df.state.replace('District Of Columbia','District of Columbia')

## engagement dataset
path = '../input/learnplatform-covid19-impact-on-digital-learning/engagement_data/'
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    district_id = filename.split("/")[-1].split(".")[0]
    df["district_id"] = district_id
    li.append(df)

engagement_df = pd.concat(li)
engagement_df = engagement_df.reset_index(drop=True)
# summary
df_list = [
    districts_df,
    products_df,
    engagement_df
]
df_name = [
    'districts_df',
    'products_df',
    'engagement_df'
]
cols = [
    'dataframe',
    'column',
    'dtype',
    'Non-Null Count',
    'Null Count',
    'unique'
]
frames = []

for i in range(len(df_list)):
    df = df_list[i].copy()
    a = df.dtypes.reset_index().rename(columns={'index': 'column', 0: 'dtype'})
    b = df.count().reset_index().rename(columns={'index': 'column', 0: 'Non-Null Count'})
    c = df.isnull().sum().reset_index().rename(columns={'index': 'column', 0: 'Null Count'})
    temp = a.merge(b, on='column').merge(c, on='column')

    dct = {col: len(df[col].unique()) for col in df.columns}
    df_unique = pd.DataFrame({
        'column': dct.keys(),
        'unique': dct.values(),
    })
    temp = temp.merge(df_unique, on='column')
    temp['dataframe'] = df_name[i]
    frames.append(temp)
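The loop above only collects one summary frame per dataset in frames; the concatenation step is not part of the original cell, so the following is a minimal sketch (an assumption on my part) of how the pieces could be combined into a single overview table using cols:

summary = pd.concat(frames).reset_index(drop=True)
summary = summary[cols]   # reorder columns: dataframe, column, dtype, counts, unique
summary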
Basic analysis: COVID-19 Impact on Digital Learning
The main objective is to gain a good understanding of the LearnPlatform COVID-19 Impact on Digital Learning challenge proposed by Kaggle.
The steps to follow are:
- Overview of the Dataset: Understanding the datasets available.
- Preprocessing: Preprocessing of the datasets available.
- EDA: Exploratory data analysis using visualization tools in Python.
Note: My analysis is inspired by several of the notebooks that other participants have uploaded to the challenge, so some graphics or images belong to these authors. The most important ones can be found in the references. On the other hand, my project is available as a Jupyter Book; click on the following link.
Overview of the Dataset
The objective of this section is to read and interpret each of the available datasets, analyzing them column by column. For each dataset we give a brief description:
- File: file name (.csv).
- Shape: dimensionality of the dataset.
- Description: basic description of the dataset.
- Top 5 rows: the first 5 rows, plus an explanation for some columns (see the sketch after this list).
- Summary: summary of the dataset.
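A minimal sketch of how the Shape and Top 5 rows entries can be obtained for each dataset (assuming the dataframes loaded above; display is the usual notebook convenience):

for name, df in zip(df_name, df_list):
    print(f"{name}: {df.shape[0]} rows x {df.shape[1]} columns")
    display(df.head())   # first 5 rows of each dataset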
1. Districts
- File: districts_info.csv.
- Shape: \(233\) rows \(\times\) \(7\) columns.
- Description: the file contains information about each school district.
- Top 5 rows:
- Summary:
2. Products
- File: products_info.csv.
- Shape: \(372\) rows \(\times\) \(6\) columns.
- Description: the file contains information about the characteristics of each digital learning product (tool).
- Top 5 rows:
- Summary:
3. Engagement
- File: engagement_data/*.csv.
- Shape: \(22324190\) rows \(\times\) \(5\) columns.
- Description: for each school district there is a separate file containing the daily engagement of each tool during 2020. The files can be joined with the other datasets by the key columns district_id and lp_id.
- Top 5 rows:
- Summary:
Preprocessing
Preprocessing is an important step in any analytics competition. It helps you to handle your data more efficiently. However, please note that the way I preprocess the data may not be suited for your analysis purposes. Therefore, before you begin preprocessing your data, think about which data you would like to keep and/or modify and which data is not relevant for your analysis.
- one-hot encoding the product sectors
- splitting up the primary essential function into main and sub category
Note: Preprocessing varies if you see other notebooks of this challenge. The processing will depend on the understanding of each of the datasets and the extra information that you may have.
# products_df
products_df['primary_function_main'] = products_df['primary_essential_function'].apply(lambda x: x.split(' - ')[0] if x == x else x)
products_df['primary_function_sub'] = products_df['primary_essential_function'].apply(lambda x: x.split(' - ')[1] if x == x else x)

# Synchronize similar values
products_df['primary_function_sub'] = products_df['primary_function_sub'].replace({'Sites, Resources & References': 'Sites, Resources & Reference'})
#products_df.drop("Primary Essential Function", axis=1, inplace=True)

temp_sectors = products_df['sector(s)'].str.get_dummies(sep="; ")
temp_sectors.columns = [f"sector_{re.sub(' ', '', c)}" for c in temp_sectors.columns]
products_df = products_df.join(temp_sectors)
#products_df.drop("Sector(s)", axis=1, inplace=True)
#del temp_sectors
# engagement_df
'time'] = pd.to_datetime(engagement_df['time'])
engagement_df[
= engagement_df[['time']].drop_duplicates('time')
temp 'week'] = temp['time'].apply(lambda x: x.isocalendar()[1])
temp[= engagement_df.merge(temp,on ='time')
engagement_df
'lp_id'] = engagement_df['lp_id'].fillna(-1).astype(int)
engagement_df['district_id'] = engagement_df['district_id'].fillna(-1).astype(int)
engagement_df[
= engagement_df.merge(
engagement_df_mix 'district_id','state']],
districts_df[[= 'district_id'
on
)= engagement_df_mix.merge(
engagement_df_mix 'lp_id','product_name','sector_Corporate', 'sector_HigherEd','sector_PreK-12']],
products_df[[= 'lp_id'
on )
EDA
Exploratory data analysis is the most important part of the challenge, since this will make the difference between the winner and the other participants. You should keep in mind that your visualizations must be able to simply and easily summarize the datasets. Also, it is hoped that the proposed visualizations can help to understand behaviors that are not easy to analyze with a simple table.
Visualizations will be made in matplotlib, seaborn, and plotly. Based on the article Diverging Color Maps for Scientific Visualization (Expanded) by Kenneth Moreland, we will use a Grays scale together with the technique of dark text on a light background.
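As an aside, a minimal sketch of how this grayscale convention could be set globally (these exact settings are my assumption; the notebook instead passes the Greys_d palette to each plot):

sns.set_palette("Greys_d")           # grayscale default palette
plt.rcParams.update({
    "figure.facecolor": "white",     # light background
    "axes.facecolor": "white",
    "text.color": "black",           # dark text
    "axes.labelcolor": "black",
})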
Note: The visualizations in this notebook are static. You can use different tools to make dynamic visualizations (Altair, plotly, etc.). You can also use tools like Streamlit to build dashboards. On the other hand, if you fully understand Python visualization tools and have knowledge of HTML/CSS, you can make beautiful notebook presentations like this one.
Visualization: Districts
First of all, I am interested in how diverse the available school districts are. As you can see in the plot below, the available data does not cover all states in the U.S. The states with the most available school districts are CT (30) and UT (29), while there are also states with only one school district (FL, TN, NY, AZ).
# map plot: districts
us_state_abbrev = {
    'Alabama': 'AL',
'Alaska': 'AK',
'American Samoa': 'AS',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District Of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Guam': 'GU',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Northern Mariana Islands':'MP',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Puerto Rico': 'PR',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virgin Islands': 'VI',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY'
}
districts_df['state_abbrev'] = districts_df['state'].replace(us_state_abbrev)
districts_info_by_state = districts_df['state_abbrev'].value_counts().to_frame().reset_index(drop=False)
districts_info_by_state.columns = ['state_abbrev', 'num_districts']

temp = pd.DataFrame({
    'state_abbrev': us_state_abbrev.values(),
})
temp = temp.merge(districts_info_by_state, on='state_abbrev', how='left').fillna(0)
temp['num_districts'] = temp['num_districts'].astype(int)

fig = go.Figure()
layout = dict(
    title_text="Number of Available School Districts per State",
    title_font_color="black",
    geo_scope='usa',
)
fig.add_trace(
    go.Choropleth(
        locations=temp.state_abbrev,
        zmax=1,
        z=temp.num_districts,
        locationmode='USA-states',  # set of locations match entries in `locations`
        marker_line_color='black',
        geo='geo',
        colorscale=px.colors.sequential.Greys,
    )
)
fig.update_layout(layout)
fig.show()
# bar plot: districts
plt.style.use('default')
plt.figure(figsize=(14,8))

plotting = sns.countplot(
    y="state",
    data=districts_df,
    order=districts_df.state.value_counts().index,
    palette="Greys_d",
    linewidth=3
)
for container in plotting.containers:
    plotting.bar_label(container, fontsize=16)

# Text
plotting.text(x=-5, y=-4.2, s="State Distribution", fontsize=24, weight='bold', alpha=.90);
plotting.text(x=-5, y=-3, s="Distribution of United States", fontsize=16, alpha=.85)
plotting.text(x=31.2, y=0.08, s='Highest', weight='bold', fontsize=14)
plotting.text(x=1.7, y=22.3, s='Lowest', weight='bold', fontsize=14)

plt.yticks(fontsize=14)
plt.xticks(fontsize=14)
plt.show()
Visualization: Locales
Locales are separated into 4 categories: Suburb, Rural, City and Town, where most of the locales are concentrated in the Suburb category (104).
For the pct_black/hispanic variable, the Rural and Town categories concentrate their entire population in the interval \([0, 0.2[\), while for the other locales this percentage is more varied.
For the pct_free/reduced and pp_total_raw indicators, the distribution for each locale is different, although they tend to concentrate in a particular interval.
# heatmap: districts -> locale
temp = districts_df.groupby('locale').pp_total_raw.value_counts().to_frame()
temp.columns = ['amount']
temp = temp.reset_index(drop=False)
temp = temp.pivot(index='locale', columns='pp_total_raw')['amount']
temp = temp[['[4000, 6000[', '[6000, 8000[', '[8000, 10000[', '[10000, 12000[',
             '[12000, 14000[', '[14000, 16000[', '[16000, 18000[',
             '[18000, 20000[', '[20000, 22000[', '[22000, 24000[', ]]

temp1 = districts_df.groupby('locale')['pct_black/hispanic'].value_counts().to_frame()
temp1.columns = ['amount']
temp1 = temp1.reset_index(drop=False)
temp1 = temp1.pivot(index='locale', columns='pct_black/hispanic')['amount']

temp2 = districts_df.groupby('locale')['pct_free/reduced'].value_counts().to_frame()
temp2.columns = ['amount']
temp2 = temp2.reset_index(drop=False)
temp2 = temp2.pivot(index='locale', columns='pct_free/reduced')['amount']

plt.style.use('default')

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(24,18))

sns.countplot(data=districts_df, x='locale', ax=ax1, palette='Greys_d')
ax1.text(x=-0.5, y=120, s="Locale Distribution", fontsize=24, weight='bold', alpha=.90);
ax1.xaxis.set_tick_params(labelsize=16)
for container in ax1.containers:
    ax1.bar_label(container, fontsize=16)

sns.heatmap(temp1.fillna(0), annot=True, cmap='Greys', ax=ax2, annot_kws={"fontsize": 14})
ax2.set_title('Heatmap: locale and pct_black/hispanic', fontsize=16, loc='left')
ax2.xaxis.set_tick_params(labelsize=16)
ax2.yaxis.set_tick_params(labelsize=16)

sns.heatmap(temp.fillna(0), annot=True, cmap='Greys', ax=ax3, annot_kws={"fontsize": 14})
ax3.set_title('Heatmap: locale and pp_total_raw', fontsize=16, loc='left')
ax3.xaxis.set_tick_params(labelsize=16)
ax3.yaxis.set_tick_params(labelsize=16)

sns.heatmap(temp2.fillna(0), annot=True, cmap='Greys', ax=ax4, annot_kws={"fontsize": 14})
ax4.set_title('Heatmap: locale and pct_free/reduced', fontsize=16, loc='left')
ax4.xaxis.set_tick_params(labelsize=16)
ax4.yaxis.set_tick_params(labelsize=16)

plt.show()
Visualization: Sectors
Sectors are separated into 3 categories: sector_Corporate, sector_HigherEd and sector_PreK-12, where the majority category is sector_PreK-12 (350). On the other hand, analyzing the primary_function_main variable, all sectors are concentrated in the LC category. It is worth mentioning that the distribution of the other categories remains almost the same across sectors.
plt.style.use('default')
names = ['sector_Corporate', 'sector_HigherEd', 'sector_PreK-12']
counts = [products_df[x].sum() for x in names]

temp_bar = pd.DataFrame({
    'sector': names,
    'count': counts
}).sort_values('count', ascending=False)

temp = products_df.groupby('primary_function_main')[names].sum()

#fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(12,6))
plt.figure(figsize=(18,18))

plt.subplot(3,2,1)
ax = sns.barplot(x="sector", y="count", data=temp_bar, palette='Greys_d')
for container in ax.containers:
    ax.bar_label(container, fontsize=12)

plt.subplot(3,2,2)
sns.heatmap(temp.T, annot=True, cmap='Greys', annot_kws={"fontsize": 10}, fmt='g')

plt.text(x=-6, y=-0.25, s="Sectors Distribution", fontsize=18, weight='bold', alpha=.90);
plt.show()
Visualization: primary_function_main
Continuing the analysis of the primary_function_main variable, we observe that most products are in the LC category (77%). Within this category, the subcategories are analyzed, where the predominant subcategory is Sites, Resources & Reference (101).
# pieplot: products
color = [
    'darkgray',
    'silver',
    'lightgray',
    'gainsboro',
]
products_df["primary_function_main"].value_counts().plot(
    kind='pie',
    autopct='%1d%%',
    figsize=(6,6),
    colors=color,
    wedgeprops={"edgecolor": "k", 'linewidth': 0.8,},
    textprops={'color': "black"},
    startangle=0)
plt.text(x=-1.4, y=1.1, s="Categories", fontsize=18, weight='bold', alpha=.90);
plt.show()
# pieplot: products -> subcategories
plt.style.use('default')
plt.figure(figsize=(18,8))

temp = products_df[products_df.primary_function_main == 'LC']
ax = sns.countplot(
    data=temp,
    y='primary_function_sub',
    order=temp.primary_function_sub.value_counts().index,
    palette='Greys_d'
)
for container in ax.containers:
    ax.bar_label(container, fontsize=16)

#plt.title('Sub-Categories in Primary Function LC')
plt.text(x=-50, y=-0.8,
         s="Sub-Categories in Primary Function LC", fontsize=24, weight='bold', alpha=.90);

plt.text(x=105, y=0.08, s='Highest', weight='bold', fontsize=16)
plt.text(x=7, y=6, s='Lowest', weight='bold', fontsize=16)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
plt.show()
Visualization: Products
After understanding the functionality of each tool, we need to understand how the tools are distributed. The first step is to study the distribution of the providers of the products we have, where:
- 258 providers have 1 occurrence.
- 18 providers have 2 occurrences.
- 9 providers have 3 occurrences.
- 2 providers have 4 occurrences.
- 2 providers have 6 occurrences.
- 1 provider has 30 occurrences.
Based on this, only the top 15 providers will be displayed.
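The occurrence counts listed above can be reproduced by taking a value_counts of the provider value counts (a minimal sketch, assuming the preprocessed products_df):

provider_counts = products_df['provider/company_name'].value_counts()
print(provider_counts.value_counts().sort_index())   # providers grouped by number of occurrences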
dct = {
    'Savvas Learning Company | Formerly Pearson K12 Learning': 'Savvas Learning Company'
}
temp = products_df['provider/company_name'].value_counts().reset_index()
temp.columns = ['provider/company_name', 'count']
temp = temp.replace(dct)

n = 15
temp = temp.sort_values('count', ascending=False).head(n)

plt.style.use('default')
plt.figure(figsize=(18,8))

ax = sns.barplot(
    data=temp,
    y='provider/company_name',
    x='count',
    palette='Greys_d'
)
for container in ax.containers:
    ax.bar_label(container, fontsize=15)

plt.text(x=-7, y=-1,
         s=f"Top {n} provider/company name", fontsize=20, weight='bold', alpha=.90);

plt.text(x=31, y=0.08, s='Highest', weight='bold', fontsize=16)
plt.text(x=3, y=14.2, s='Lowest', weight='bold', fontsize=16)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
plt.show()
With regard to products, there are 372 different products.
We can make a word cloud to analyze, in a different way, which words are repeated the most in the product_name variable.
cloud = WordCloud(
    width=1080,
    height=270,
    colormap='Greys',
    background_color='white'
).generate(" ".join(products_df['product_name'].astype(str)))

plt.figure(figsize=(22, 10))
plt.imshow(cloud)
plt.axis('off');
To understand the use of these products in more detail, we will analyze them with respect to the engagement_index variable. The first graph shows the average engagement_index (per student) for the year 2020, displaying the top 20 products.
An important fact is that 362 products have an average of less than 1!
group_01 = (engagement_df_mix.groupby('product_name')['engagement_index'].mean()/1000).reset_index().sort_values('engagement_index', ascending=False)
group_01['engagement_index'] = group_01['engagement_index'].apply(lambda x: round(x, 2))
less_1 = len(group_01.loc[lambda x: x['engagement_index'] < 1])

plt.style.use('default')
plt.figure(figsize=(14,8))

plotting = sns.barplot(
    y="product_name",
    x="engagement_index",
    data=group_01.head(20),
    palette="Greys_d",
)
for container in plotting.containers:
    plotting.bar_label(container, fontsize=14)

plt.text(x=-3.5, y=-3,
         s="Mean daily page-load events in top 20 tools", fontsize=20, weight='bold', alpha=.90);
plt.text(x=-3.5, y=-2,
         s="per 1 student", fontsize=14, alpha=.90);

plt.text(x=11, y=0.1, s='Highest', weight='bold', fontsize=14)
plt.text(x=1, y=19.2, s='Lowest', weight='bold', fontsize=14)
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
plt.show()
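The figure of 362 products quoted above corresponds to the less_1 variable computed in the block; a quick check (assuming the block has just been run):

print(f"{less_1} products have a mean engagement_index below 1 per student")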
Let’s study the temporal behavior (at the week level) of these tools during the year 2020, where the three most used tools will be shown in different colors, while the remaining tools will be drawn in a single shared color (in order to understand their distribution).
Note: The proposed analysis can also be carried out at the day level, analyzing each of the tools as a time series over the year 2020 (a sketch follows the weekly plot below).
col = 'week'

group_04 = (engagement_df_mix.groupby(['product_name', col])['engagement_index'].mean()/1000).reset_index().sort_values('engagement_index', ascending=False)

g_high = group_01.head(3)['product_name']
group_04_top = group_04.loc[lambda x: x.product_name.isin(g_high)]

states = group_04['product_name'].unique()
times = group_04[col].unique()

index = pd.MultiIndex.from_product([states, times], names=["product_name", col])

df_complete = pd.DataFrame(index=index).reset_index().fillna(0)

group_04 = df_complete.merge(group_04, on=['product_name', col], how='left').fillna(0)

n = 3
g_high = group_04.groupby('product_name')['engagement_index'].sum().sort_values(ascending=False).head(n).index.to_list()

colors = [
    'lightgray',
    'dimgray',
    'black',
    'firebrick',
    'darkred']
palette_01 = {x: 'lavender' for x in group_04['product_name'].unique() if x not in g_high}
palette_02 = {g_high[i]: colors[i] for i in range(n)}

plt.style.use('default')
plt.figure(figsize=(20,6))

sns.lineplot(
    data=group_04.loc[lambda x: ~x.product_name.isin(g_high)],
    x=col,
    y="engagement_index",
    hue='product_name',
    legend=False,
    palette=palette_01,
    linewidth=1.
)
sns.lineplot(
    data=group_04.loc[lambda x: x.product_name.isin(g_high)],
    x=col,
    y="engagement_index",
    hue='product_name',
    palette=palette_02,
    linewidth=1.
)

plt.text(x=-2, y=23.7, s='Mean daily page-load events in top 3 tools', weight='bold', fontsize=14)
plt.text(x=-2, y=22.3, s='by products and time, per 1 student', fontsize=12)

plt.text(x=12, y=20.7, s='1,000 cases of COVID', weight='bold', fontsize=8)
plt.text(x=37, y=20.7, s='1st September', weight='bold', fontsize=8)

plt.axvline(x=11, color='black', linestyle='--', linewidth=0.5)
plt.axvline(x=36, color='black', linestyle='--', linewidth=0.5)
plt.show()
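As mentioned in the note above, the same analysis could be carried out at the day level; a minimal sketch of such a daily view for the same top 3 tools (the grouping choices here are my own, not part of the original notebook):

# mean daily page-load events per student, by product and calendar day
daily = (engagement_df_mix
         .groupby(['product_name', 'time'])['engagement_index']
         .mean()
         .div(1000)
         .reset_index())

plt.figure(figsize=(20,6))
sns.lineplot(
    data=daily.loc[lambda x: x.product_name.isin(g_high)],
    x='time',
    y='engagement_index',
    hue='product_name',
    palette=palette_02,
    linewidth=1.
)
plt.show()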
Now we can look at the engagement index of the most important tools across districts, where the districts of Wisconsin, Missouri and Virginia have the highest engagement index among the three most used tools.
group_02 = (engagement_df_mix.groupby(['state', 'product_name'])['engagement_index'].mean()/1000)\
    .reset_index().sort_values('engagement_index', ascending=False).fillna(0)

group_02_top = group_02.loc[lambda x: x.product_name.isin(g_high)]
group_02_top['engagement_index'] = group_02_top['engagement_index'].apply(lambda x: round(x, 2))
#group_02_top = group_02_top.loc[lambda x: x['engagement_index']>0]

plt.style.use('default')

g = sns.FacetGrid(group_02_top, hue='product_name', col='product_name', height=4, col_wrap=3)
g.map(sns.barplot, "engagement_index", "state", palette="Greys_d",)

g.fig.set_size_inches(15, 8)
g.fig.subplots_adjust(top=0.81, right=0.86)

axes = g.axes.flatten()
for ax in axes:
    for container in ax.containers:
        ax.bar_label(container, fontsize=8)

plt.text(x=-50, y=-4, s="Mean daily page-load events in top 3 tools", fontsize=16, weight='bold', alpha=.90);
plt.text(x=-50, y=-3, s="by state and products, per 1 student", fontsize=14, alpha=.90);
plt.show()
Summary
- Depending on what you want to achieve, you might want to carefully preselect districts. Note that the approach in this notebook might not necessarily suit your individual purposes.
- When looking at digital learning, you might want to spend some time figuring out which districts actually applied digital learning (see the sketch below).
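A minimal sketch of one way to check which districts actually show digital-learning activity (counting days with non-zero engagement per district is my own heuristic, not part of the original notebook):

# days with any recorded engagement, per district
active_days = (engagement_df
               .loc[lambda x: x['engagement_index'].fillna(0) > 0]
               .groupby('district_id')['time']
               .nunique()
               .sort_values(ascending=False))
print(active_days.head(10))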