Summary
This notebook presents approaches for exploratory data analysis and basic statistical analysis. Python functions are written to produce clear data visualizations, including histograms, CDFs, swarm plots, box plots, violin plots, kernel density estimation, cross plots and correlation. Probability distributions including the binomial PMF and the Poisson, Normal and Exponential distributions are discussed. Finally, data manipulation and data analysis with Pandas are presented.
Python functions and data files needed to run this notebook are available via this link.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import pylab as plt
from matplotlib import gridspec      # needed for gridspec.GridSpec below
from scipy.stats import zscore       # used later for standardizing data
from functions import *             # import required functions to run this notebook
import warnings
warnings.filterwarnings('ignore')
The data are the 2008 US swing state election results, downloaded from https://www.kaggle.com/datasets/aman1py/swing-states?resource=download
df_swing=pd.read_csv('./Data/2008_swing_states.csv')
df_swing[['state','county','dem_share']][:5]
font = {'size' : 7}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(10, 7), dpi= 100, facecolor='w', edgecolor='k')
gs = gridspec.GridSpec(2, 4)
ax1=plt.subplot(gs[0, :2], )
KPI='dem_share'
val=df_swing[df_swing['state']=='OH'][KPI]
EDA_plot.histplt (val,bins=20,title='OH State',xlabl=KPI,
ylabl='Percentage',xlimt=(0,100),ylimt=(0,0.2),axt=ax1,
scale=1.15,loc=1,font=10,color='#8DC63F')
ax2=plt.subplot(gs[0, 2:])
KPI='dem_share'
val=df_swing[df_swing['state']=='PA'][KPI]
EDA_plot.histplt (val,bins=20,title='PA State',xlabl=KPI,
ylabl='Percentage',xlimt=(0,100),ylimt=(0,0.2),axt=ax2,
scale=1.15,loc=1,font=10,color='#025174')
ax3=plt.subplot(gs[1, 1:3])
val=df_swing[df_swing['state']=='FL'][KPI]
EDA_plot.histplt (val,bins=20,title='FL State',xlabl=KPI,
ylabl='Percentage',xlimt=(0,100),ylimt=(0,0.2),axt=ax3
,scale=1.15,loc=1,font=10,color='#323539')
plt.subplots_adjust(wspace=0.9)
plt.subplots_adjust(hspace=0.45)
fig.suptitle('2008 US Swing State Election_Histogram of dem_share for each State', fontsize=16,y=1.02)
plt.show()
font = {'size' : 12}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 4), dpi= 100, facecolor='w', edgecolor='k')
sns.set_theme(style="whitegrid")
ax = sns.swarmplot(x="state", y="dem_share", data=df_swing)
df_means = df_swing.groupby("state")["dem_share"].agg("mean").reset_index()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
sns.scatterplot(x="state", y="dem_share", marker='X', color='black',
s=100, zorder=3, ax=ax, legend=True, data=df_means)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.show()
The center line of a box plot is the 50th percentile (median); the box extends from the 25th percentile to the 75th percentile, a range called the IQR (interquartile range). The whiskers extend to the rest of the data, up to 1.5 IQR beyond the box.
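As a quick check of these numbers, the quartiles and IQR behind the box plot can be computed directly with NumPy (a minimal sketch using the df_swing data loaded above; the OH filter is illustrative):
# Quartiles and IQR of dem_share for one state (OH, as in the plots above)
val = df_swing[df_swing['state'] == 'OH']['dem_share']
q1, q2, q3 = np.percentile(val, [25, 50, 75])
iqr = q3 - q1
print(f'25th = {q1:.1f}, median = {q2:.1f}, 75th = {q3:.1f}, IQR = {iqr:.1f}')
print(f'whisker limits: {q1 - 1.5*iqr:.1f} to {q3 + 1.5*iqr:.1f}')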
font = {'size' : 7}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 4), dpi= 100, facecolor='w', edgecolor='k')
sns.set_theme(style="whitegrid")
ax = sns.swarmplot(x="state", y="dem_share", data=df_swing)
ax = sns.boxplot(x="state", y="dem_share", data=df_swing)
df_means = df_swing.groupby("state")["dem_share"].agg("mean").reset_index()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
sns.scatterplot(x="state", y="dem_share", marker='X', color='black',
s=100, zorder=3, ax=ax, legend=True, data=df_means)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.show()
#plt.style.available
plt.style.use('classic')
KPI='dem_share'
val=[]
val.append(np.array(df_swing[df_swing['state']=='PA'][KPI]))
val.append(np.array(df_swing[df_swing['state']=='OH'][KPI]))
val.append(np.array(df_swing[df_swing['state']=='FL'][KPI]))
#
Position=[100,500,1000]
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(9, 5), dpi= 130, facecolor='w', edgecolor='k')
v1 = plt.violinplot(val, points=100, positions=Position, widths=300,
showmeans=True,showextrema=True, showmedians=False,bw_method =0.15)
for b in v1['bodies']:
    m = np.mean(b.get_paths()[0].vertices[:, 0])
    b.get_paths()[0].vertices[:, 0] = np.clip(b.get_paths()[0].vertices[:, 0], -np.inf, m)
    b.set_color('darkred')
    b.set_edgecolor('k')
    b.set_linewidth(1)
    b.set_alpha(0.9)
#plt.axis('off')
plt.ylabel('dem_share',fontsize=12)
plt.xticks([100,500,1000],['PA','OH','FL'],rotation=0)
plt.title("Violin Plot",fontweight="bold",fontsize=16)
plt.show()
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(9, 8), dpi= 130, facecolor='w', edgecolor='k')
ax1=plt.subplot(2,1,1)
label=['OH State','PA State','FL State']
data_var=len(label)*[None]
KPI='dem_share'
data_var[0]=np.array(df_swing[df_swing['state']=='OH'][KPI])
data_var[1]=np.array(df_swing[df_swing['state']=='PA'][KPI])
data_var[2]=np.array(df_swing[df_swing['state']=='FL'][KPI])
nvar=3
xs = np.linspace(0,100,500)
colors = ['r','b','g']
title='2008 US Swing State Election_Kernel Density Estimation for dem_share for each State'
EDA_plot.KDE(xs,data_var,nvar,label,colors,xlabel=None,title=title,ylabel='Probability',xlim=(0,100),ylim=(0,0.01),
LAMBA=0.5,linewidth=2.5,loc=0,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=12,leg_ftze=9)
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(9, 8), dpi= 130, facecolor='w', edgecolor='k')
label=['OH State','PA State','FL State']
data_var=len(label)*[None]
KPI='dem_share'
data_var[0]=np.array(df_swing[df_swing['state']=='OH'][KPI])
data_var[1]=np.array(df_swing[df_swing['state']=='PA'][KPI])
data_var[2]=np.array(df_swing[df_swing['state']=='FL'][KPI])
ax1=plt.subplot(2,1,1)
title='2008 US Swing State Election_Empirical CDF for dem_share for each State'
EDA_plot.CDF_plot(data_var,nvar,label,colors,xlabel=None,title=title,ylabel='Cumulative Probability',bins=100,
xlim=(0,100),ylim=(0,1),linewidth=2.5,loc=0,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=12,leg_ftze=9)
plt.show()
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 8), dpi= 130, facecolor='w', edgecolor='k')
label=['FL State']
KPI='dem_share'
data_var=np.array(df_swing[df_swing['state']=='FL'][KPI])
ax1=plt.subplot(2,1,1)
title='2008 US Swing State Election_Empirical CDF for dem_share for FL State'
EDA_plot.CDF_plot(data_var,1,label,colors='g',xlabel=None,title=title,ylabel='Cumulative Probability',bins=100,
xlim=(0,90),ylim=(0,1),linewidth=2.5,loc=4,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=12,leg_ftze=9)
# 25th percentile
per=0.25
Perctile=np.quantile(data_var, per, axis=0, keepdims=True)[0]
txt= '25th percentile= '
txt+=f'{np.round(Perctile,1)}'
plt.plot([0, Perctile], [per, per], 'k--', linewidth=1.2)
plt.plot([Perctile, Perctile], [0, per], 'k--', linewidth=1.2)
plt.text(Perctile/6, per+0.03, txt, color='b',
fontsize=10)
# 50th percentile
per=0.5
Perctile=np.quantile(data_var, per, axis=0, keepdims=True)[0]
txt= '50th percentile= '
txt+=f'{np.round(Perctile,1)}'
plt.plot([0, Perctile], [per, per], 'k--', linewidth=1.2)
plt.plot([Perctile, Perctile], [0, per], 'k--', linewidth=1.2)
plt.text(Perctile/6, per+0.03, txt, color='b',
fontsize=10)
# 75th percentile
per=0.75
Perctile=np.quantile(data_var, per, axis=0, keepdims=True)[0]
txt= '75th percentile= '
txt+=f'{np.round(Perctile,1)}'
plt.plot([0, Perctile], [per, per], 'k--', linewidth=1.2)
plt.plot([Perctile, Perctile], [0, per], 'k--', linewidth=1.2)
plt.text(Perctile/6, per+0.03, txt, color='b',
fontsize=10)
plt.show()
font = {'size' : 12}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(7, 5), dpi= 100, facecolor='w', edgecolor='k')
x=df_swing['total_votes']
y=df_swing['dem_share']
EDA_plot.CrossPlot(x,y,title='cross plot: total votes vs. dem_vote',
xlabl='total votes',ylabl='percent of dem_vote',loc=1,xlimt=(np.nanmin(x),np.nanmax(x)),
ylimt=(np.nanmin(y),np.nanmax(y)),axt=ax1,scale=1.2,alpha=0.9,markersize=6,marker='bo',font=font)
Synthetic data with six features (Var1 to Var6), drawn from Gaussian, triangular and uniform distributions, are generated to have linear and non-linear correlations with a target, for 100,000 samples.
# Generate synthetic data
df=quadrac_swiss_roll(noise=2,nsim=100000,time_rand1=1,seed=42,a1=20,b1=8,a2=50,b2=8,swiss_roll=True)
features_colums=df.columns
# Standardize data
for i in df.columns:
    df[i] = zscore(df[i])
The figures below show the histogram of each variable and the cross-plot matrix between the features Var1 to Var6 and the target. Var1 and Var2 are non-linearly correlated with the target, Var3 has a strong positive correlation (0.68), and Var4 and Var5 are negatively correlated with the target.
font = {'size' : 7}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(10, 10), dpi= 100, facecolor='w', edgecolor='k')
colors_map = plt.cm.get_cmap('jet')
colors = colors_map(np.linspace(0,0.8,len(features_colums)))
for ir in range(len(features_colums)):
    ax1 = plt.subplot(4, 2, ir+1)
    val = df[features_colums[ir]]
    EDA_plot.histplt(val, bins=20, title=f'{features_colums[ir]}', xlabl=None, days=False,
                     ylabl=None, xlimt=None, ylimt=(0,0.3),
                     axt=ax1, nsplit=5, scale=1.02, loc=2, font=10.5, color=colors[len(features_colums)-ir-1])
plt.subplots_adjust(hspace=0.4)
plt.subplots_adjust(wspace=0.3)
fig.suptitle(f'Histogram of Variables for Synthetic Data', fontsize=16,y=0.96)
plt.show()
font = {'size' :9 }
plt.rc('font', **font)
fig=plt.figure(figsize=(10, 9), dpi= 100, facecolor='w', edgecolor='k')
# Standardize data
for i in features_colums:
    df[i] = zscore(df[i])
cplotmatrix(df,font=font,alpha=0.008,marker='g.',missin_rep=False)
plt.show()
Correlation Coefficient
font = {'size' : 6}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(3.5, 4), dpi= 200, facecolor='w', edgecolor='k')
corr=df.corr()
corr=corr['Target'].drop(['Target'])
coefs=corr.values
features_colums=list(corr.index)
Correlation_plot.corr_bar(coefs,clmns=features_colums,select=False,yfontsize=6.0,title=f'Linear Correlation with Target',
ymax_vert_lin=30,xlim = [-0.5, 0.7])
Coefficient of Determination
The coefficient of determination, termed $R^{2}$, is a basic approach to quantify the importance of features for a target. In statistics, $R^{2}$ assesses the ability of a statistical model to explain and predict future outcomes. In other words, if we have a dependent variable y and an independent variable x in a model, then $R^{2}$ measures how much of the variation in y is explained by the variation in x. It is one of the key outputs of regression analysis and is used when we want to predict the future or test a model against related information. $R^{2}$ lies between 0 and 1, and the higher the value of $R^{2}$, the better the prediction and the stronger the model. $R^{2}$ is simply the square of the correlation coefficient ($\rho$).
There are multiple formulas to calculate the coefficient of determination. It can be calculated simply from the correlation coefficient:
$\large R^{2}=\rho^{2}$
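As a quick numerical check (a minimal sketch; the column names 'Var3' and 'Target' are assumed from the synthetic data above), the squared Pearson correlation gives the coefficient of determination:
# rho from np.corrcoef, R^2 as its square ('Var3' and 'Target' assumed from the synthetic data)
rho = np.corrcoef(df['Var3'], df['Target'])[0, 1]  # Pearson correlation coefficient
print(f'rho = {rho:.3f}, R^2 = {rho**2:.3f}')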
ir=0
corr=df.corr()
corr=corr['Target'].drop(['Target'])
coefs=corr.values
coefs= np.array([i**2 for i in coefs])
# Plot the importance of features
font = {'size' : 7}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(6, 3), dpi= 180, facecolor='w', edgecolor='k')
_=prfrmnce_plot(coefs, title=f'Coefficient of Determination',
ylabel='Coefficient of Determination',clmns=features_colums,titlefontsize=9,
xfontsize=7, yfontsize=8).bargraph(perent=True,fontsizelable=8,xshift=-0.25,axt=ax1,
yshift=0.02,ylim=[0,1], xlim=[-0.5,5.5], y_rot=0)
Binomial distribution
In probability theory and statistics, the binomial distribution with parameters n and p is the discrete probability distribution of the number of successes in a sequence of n independent experiments, each asking a yes–no question.
The probability mass function for the binomial distribution is
$\LARGE P(N)=\binom{n}{N}p^{N}(1-p)^{n-N}$
where n is the number of trials, p is the probability of success, and N is the number of successes.
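The theoretical PMF can be evaluated directly with scipy.stats (a small sketch; the parameters match the sampling cell below):
from scipy.stats import binom
# Binomial PMF: probability of N successes in n=60 trials with success probability p=0.1
n, p = 60, 0.1
for N in range(11):
    print(f'P(N={N:2d}) = {binom.pmf(N, n, p):.4f}')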
Samples=np.random.binomial(n=60,p=0.1,size=10000)
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 8), dpi= 130, facecolor='w', edgecolor='k')
title='Binomial PMF, n=60, p=0.1, size=10000'
ax1=plt.subplot(2,1,1)
EDA_plot.histplt (Samples,bins=800,title=title,xlabl=None,
ylabl='Percentage',xlimt=(0,15),ylimt=(0,0.2),axt=ax1,
scale=1.15,loc=1,font=10,color='g')
ax1=plt.subplot(2,1,2)
EDA_plot.CDF_plot(Samples,1,label='Binomial ecdf',colors='g',xlabel=None,title=title,ylabel='Cumulative Probability',bins=10000,
xlim=(0,15),ylim=(0,1),linewidth=2.5,loc=4,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=16,leg_ftze=9)
In a Poisson process, the timing of the next event is completely independent of when the previous event happened.
The Poisson distribution is the limit of the binomial distribution for a low probability of success and a large number of Bernoulli trials (a Bernoulli trial is an experiment with two outcomes: "success" (True) and "failure" (False)).
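This limiting behaviour can be checked numerically (a minimal sketch, keeping the mean $\mu=np$ fixed while n grows and p shrinks):
from scipy.stats import binom, poisson
mu = 6                                  # fixed mean n*p
for n in [20, 200, 2000]:
    p = mu / n                          # probability of success shrinks as n grows
    diff = max(abs(binom.pmf(k, n, p) - poisson.pmf(k, mu)) for k in range(30))
    print(f'n={n:5d}, p={p:.4f}, max |Binomial - Poisson| = {diff:.5f}')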
In probability theory and statistics, the Poisson distribution is a discrete probability distribution that expresses the probability of a given number of events occurring in a fixed interval of time (or space) if these events occur with a known constant mean rate:
$\LARGE P(n)=\frac{\mu ^{n}e^{-\mu}}{n!}$
where $P(n)$ is the predicted probability of $n$ occurrences and $\mu$ is the mean number of occurrences in a fixed interval of time.
A small company receives, on average, 12 customers per day ($\mu=12$). What is the probability that the company will receive exactly 8 ($n=8$) customers in one day?
$\large P(n=8)=\frac{12 ^{8}e^{-12}}{8!}=0.0655$
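The same probability can be obtained with scipy.stats.poisson, as a quick check of the hand calculation:
from scipy.stats import poisson
# P(n=8) for a Poisson distribution with mean mu=12
print(poisson.pmf(8, mu=12))  # ~0.0655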
# Draw samples from a Poisson distribution.
Samples=np.random.poisson(lam=60,size=10000)
# lam: Expected number of events occurring in a fixed-time interval,
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 8), dpi= 130, facecolor='w', edgecolor='k')
title='Poisson PMF, Expected Number of Events in Fixed Interval=60, size=10000'
ax1=plt.subplot(2,1,1)
EDA_plot.histplt (Samples,bins=1000,title=title,xlabl=None,
ylabl='Percentage',xlimt=(0,100),ylimt=None,axt=ax1,
scale=1.15,loc=1,font=9,color='g')
#
ax1=plt.subplot(2,1,2)
EDA_plot.CDF_plot(Samples,1,label='Poisson ecdf',colors='g',xlabel=None,title=title,ylabel='Cumulative Probability',bins=10000,
xlim=(0,100),ylim=None,linewidth=2.5,loc=4,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=12,leg_ftze=9)
# Draw samples from a Normal distribution.
Samples=np.random.normal(loc=3, scale=1.0, size=10000)
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 8), dpi= 130, facecolor='w', edgecolor='k')
title='Normal Distribution_loc=3, scale=1.0, size=10000'
ax1=plt.subplot(2,1,1)
EDA_plot.histplt (Samples,bins=50,title=title,xlabl=None,
ylabl='Percentage',xlimt=(0,6),ylimt=None,axt=ax1,
scale=1.15,loc=1,font=10,color='g')
ax1=plt.subplot(2,1,2)
EDA_plot.CDF_plot(Samples,1,label='Normal ecdf',colors='g',xlabel=None,title='ecdf of '+title,ylabel='Cumulative Probability',bins=10000,
xlim=(0,6),ylim=None,linewidth=2.5,loc=4,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=14,leg_ftze=9)
The waiting time between arrivals of a Poisson process is Exponentially distributed.
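This can be illustrated with a short simulation (a minimal sketch): simulate a Poisson process with rate 1 per unit time and inspect the gaps between consecutive events, which should be approximately Exponential with mean 1.
# Simulate a Poisson process with rate 1 on [0, T]: draw the number of events from a
# Poisson distribution, place them uniformly on [0, T], and look at the waiting times.
rng = np.random.default_rng(42)
T = 10000
n_events = rng.poisson(lam=T)                        # expected number of events = rate * T
arrival_times = np.sort(rng.uniform(0, T, n_events))
waiting_times = np.diff(arrival_times)               # gaps between consecutive arrivals
print('mean waiting time =', waiting_times.mean())   # close to 1 for an Exponential(1)
print('std of waiting time =', waiting_times.std())  # also close to 1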
# Draw samples from an Exponential distribution.
Samples=np.random.exponential(scale=1.0,size=10000)
font = {'size' : 11}
plt.rc('font', **font)
fig, ax1 = plt.subplots(figsize=(8, 8), dpi= 130, facecolor='w', edgecolor='k')
title='Exponential Distribution_scale=1.0, size=10000'
ax1=plt.subplot(2,1,1)
EDA_plot.histplt (Samples,bins=100,title=title,xlabl=None,
ylabl='Percentage',xlimt=(0,10),ylimt=None,axt=ax1,
scale=1.15,loc=1,font=10,color='g')
ax1=plt.subplot(2,1,2)
EDA_plot.CDF_plot(Samples,1,label='Exponential ecdf',colors='g',xlabel=None,title=title,ylabel='Cumulative Probability',bins=10000,
xlim=(0,10),ylim=None,linewidth=2.5,loc=4,axt=ax1,x_ftze=12,y_ftze=12,tit_ftze=14,leg_ftze=9)
Pandas is an open-source Python package for data manipulation and analysis with high-performance, easy-to-use methods. It is based on the dataframe concept from the R programming language. Pandas can be used as the primary tool for data processing, data manipulation and data cleaning, and it is built on top of NumPy and Matplotlib.
# Reading data in Pandas
import pandas as pd
df = pd.read_csv("./Data/WPD.csv") # Read data file
#df.head()
df[0:5] # Display data
# shape of data
df.shape
The columns attribute gives the names of the columns in the data file.
column=list(df.columns)
print(column)
The info() method gives information about each column: its data type (string or float) and the count of non-null values, from which the number of missing values can be seen.
df.info()
The describe() method gives summary statistics for each column whose data type is not string.
df.describe()
# Components of DataFrame
df.values
# index of DataFrame
df.index
Ideally, every row and column would have a valid value. In reality, missing values are one of the main challenges in machine learning. There are several approaches in practice to deal with missing values.
Option 1: Drop Column
One naive approach is to remove columns that have missing values with the drop() function.
Select_column=column[3:8] # Select the 5 columns that have missing values
Select_column
df_drop_c=df.copy() # Copy to avoid modifying the original data
df_drop_c.drop(Select_column,axis=1, inplace=True) # Drop selected 5 columns
df_drop_c[:5]
df_drop_c.info()
This approach is not reliable, since all the information in those 5 columns, not just the missing values, has been removed from the data set.
Option 2: Drop Row
Another approach is to remove rows that have missing values with the dropna() function.
df_drop_=df.copy() # Copy to avoid modifying the original data
df_drop_r=df_drop_.dropna() # Drop rows with missing values
df_drop_r[:5]
df_drop_r.info()
This approach is also not recommended because the number of rows has decreased from 1000 to 426.
Option 3: Replace with Median (mean)
A common practice is to replace missing values with the median (or mean) value of that column. The median is the middle value of a sorted list: 50% of the values are smaller and 50% are larger. See Median. The following code replaces all missing values in the 5 columns with the median of each column.
df_im=df.copy()
# Get the median of each variable and replace the missing values with median
for i in range(len(Select_column)):
    P50 = df[Select_column[i]].median()                            # Calculate median of each column
    df_im[Select_column[i]] = df_im[Select_column[i]].fillna(P50)  # Replace NaN with median (P50)
df_im[:5]
df_im.info()
Dropping duplicate names
df.drop_duplicates(subset="name")
df.drop_duplicates(subset=["name1","name2"])
Outliers are extremely high or low values that deviate from the pattern of the majority of the data. Outliers should be removed from the dataset (being careful not to treat genuine novelty as an outlier). A straightforward approach to detect outliers is to flag values more than several (n) standard deviations from the mean, $m\pm n\sigma$ ($m$ = mean, $\sigma$ = standard deviation). For example, an outlier can be defined as a value greater than $m+3\sigma$ or less than $m-3\sigma$. The following function applies this approach.
def outlier_remove(df, n, name):
    """Delete rows for a specified column where values are out of +/- n*sd standard deviations
    df  : Pandas dataframe
    n   : n in the equation 𝑚±𝑛𝜎
    name: Column name
    """
    mean = df[name].mean()  # Calculate mean of column
    sd = df[name].std()     # Calculate standard deviation of column
    drop_r = df.index[(mean - n * sd > df[name]) | (mean + n * sd < df[name])]  # Find data that are not within 𝑚±𝑛𝜎
    df.drop(drop_r, axis=0, inplace=True)    # Drop those rows
    df.reset_index(inplace=True, drop=True)  # Reset index
# Drop outliers in last column 'OIL Prod. (e3m3/month)'
df_im_out=df_im.copy()
outlier_remove(df_im_out,n=3,name='OIL Prod. (e3m3/month)')
df_im_out[:5]
A new data frame can be built by concatenating rows or columns. Here, a new dataframe is created by concatenating the two columns 'Measured Depth (m)' and 'OIL Prod. (e3m3/month)' using the concat() function.
col_1= df_im['Measured Depth (m)']
col_2 = df_im['OIL Prod. (e3m3/month)']
result = pd.concat([col_1, col_2], axis=1)
result[:5]
Rows can also be concatenated with the concat() function. The code below concatenates the first 4 rows and 3 rows from the end of the imputed dataset.
row_1= df_im[0:4] # Retrieve first 4 rows
row_2 = df_im[-4:-1] # Retrieve 3 rows from the end (the very last row is excluded)
result = pd.concat([row_1, row_2], axis=0) # Concatenate rows
result[:5]
A dataframe can simply be saved as a CSV file with the following code.
filename="myfile.csv"
df_im.to_csv(filename, index=False) # index = False not writing row numbers
Shuffling Dataset
An example of shuffling is a game of cards: all the cards are collected at the end of each round and shuffled so that they are distributed randomly and each player receives cards by chance. In machine learning, the data set is split into training and validation sets, so shuffling is very important to avoid any bias or pattern in the split datasets.
If the same shuffle is required every time the code is run, a random seed can be used. See the code below.
import numpy as np
#np.random.seed(82)
df_shfl = df_im.reindex(np.random.permutation(df_im.index))
df_shfl.reset_index(inplace=True, drop=True) # Reset index
df_shfl[0:5]
Sorting Data Set
df_sort = df_im.sort_values(by='Measured Depth (m)', ascending=True)
#df_sort.reset_index(inplace=True, drop=True) # Reset index
df_sort[:5]
We can sort values by multiple variables:
df_sort = df_im.sort_values(['Measured Depth (m)','Porosity (fraction)'], ascending=[True, False])
df_sort[:5]
Subsetting based on multiple conditions:
is_lab = df["Measured Depth (m)"] == 321.0
is_brown = df["Deviation (True/False)"] == True
df[is_lab & is_brown]
Subsetting using .isin()
df_sort[df_sort['Prod. Formation'].isin(['Sand'])][:5]
Grouping is applied to summarize data. The methods below calculate summary statistics for each column or each row; by default they operate on columns:
mean()
median()
mode()
min()
max()
var()
std()
sum()
quantile()
The following code performs grouping, generating summary statistics of the target variable 'OIL Prod. (e3m3/month)' for each category of the feature 'Prod. Formation'.
gb_mean=df_im.groupby('Prod. Formation')['OIL Prod. (e3m3/month)'].mean()
gb_mean
gb_sum=df_im.groupby('Prod. Formation')['OIL Prod. (e3m3/month)'].sum()
gb_sum
.agg() Method
The .agg() method is used for computing custom summary statistics: df['column'].agg(function).
def pct90(clm):
    return clm.quantile(0.9)
df_im['OIL Prod. (e3m3/month)'].agg(pct90)
df_im[['OIL Prod. (e3m3/month)','Measured Depth (m)']].agg(pct90)
def pct30(clm):
    return clm.quantile(0.3)
df_im['OIL Prod. (e3m3/month)'].agg([pct30,pct90])
df_im['OIL Prod. (e3m3/month)'].iloc[:5]
df_im['OIL Prod. (e3m3/month)'].iloc[:5].cumsum()
There are other cumulative statistics:
cummax()
cummin()
cumprod()
df_im['OIL Prod. (e3m3/month)'].iloc[:5].cummax()
df_im['OIL Prod. (e3m3/month)'].iloc[:5].cummin()
df_im['OIL Prod. (e3m3/month)'].value_counts()
groupby
df_im.groupby('Deviation (True/False)')['Measured Depth (m)'].agg([min,max,np.median])
df_im.groupby('Deviation (True/False)')['Measured Depth (m)'].mean()
df_im.pivot_table(aggfunc='mean', values='Measured Depth (m)', index='Deviation (True/False)')
The values argument is the column you want to summarize and index is the column you want to group by.
df_im.pivot_table(aggfunc=[np.median,np.std], values='Measured Depth (m)', index='Deviation (True/False)')
df_im.columns
df_im.index
We can move a column from the body of the dataframe to the index:
df_im[:5].set_index("X Coordinate")
To undo what we just did, we can use reset_index():
df_im[:5].reset_index(drop=True)
Indexing makes subsetting simpler:
df_im[df_im['Prod. Formation'].isin(['Shale', 'Sand'])][:5]
This is a long expression; setting an index allows simpler label-based subsetting with .loc, as shown below:
#df_im.set_index("Prod. Formation",inplace=True)
df_im[:5]
df.loc[0: 3]
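For example (a minimal sketch on a copy, assuming 'Prod. Formation' is set as the index as in the commented line above), subsetting by formation labels becomes a simple .loc lookup:
# Set 'Prod. Formation' as the index on a copy, then subset by label with .loc
df_idx = df_im.set_index('Prod. Formation')
df_idx.loc[['Shale', 'Sand']][:5]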
Multi-level indexes
df_im.set_index(["Porosity (fraction)","Permeability (Darcy)"],inplace=True)
df_im[:5]
df_im.loc[[(0.224,12.25), (0.017,0.48)]]
Sorting by index values
df_im.sort_index()[:5]
df_im.sort_index(level=['Porosity (fraction)','Permeability (Darcy)'], ascending=[True, True])[:5]
.loc and .iloc
With a multi-level index, loc cannot select on the inner level alone; it only takes outer-level labels (or full index tuples). We can also pass a second argument to loc to select columns (: means return everything).
df_im.loc[:,"X Coordinate":"Deviation (True/False)"][:5]
The iloc method slices rows and columns by integer position:
df_im.iloc[1:5,2:6]
Unlike loc, the last row and column are not included with iloc.
Feature engineering is the process of extracting new features from raw data via data mining techniques. The new features can be used to enhance the performance of machine learning algorithms, and they can also be calculated from other fields. For the data set WPD.csv, we can combine 'Surface-Casing Weight (kg/m)' and 'Production-Casing Weight (kg/m)' into a new column.
df_im_c=df_im.copy()
df_im.insert(1, 'New Column', (df_im['Surface-Casing Weight (kg/m)']*df_im['Production-Casing Weight (kg/m)']).astype(int))
df_im
Most machine learning algorithms require the input to be standardized so that values of different variables can be compared reliably. For example, if variable one ranges from 3 to 1000 and variable two ranges from 1 to 3, the two cannot be compared directly, since the highest value of variable two (3) is the lowest value of variable one. This inconsistency in scale leads to poor performance in machine learning algorithms, so it is highly recommended to standardize (normalize) the data to the same mean ($\mu$) and standard deviation ($\sigma$) before feeding it to a machine learning algorithm. A very common standardization is the Z-score:
$\Large z = \frac{x - \mu}{\sigma} $
where $\mu = \frac{1}{n} \sum_{i=1}^n x_i$, $\sigma = \sqrt{\frac{1}{n} \sum_{i=1}^n (x_i - \mu)^2}$, and $n$ is the number of data points. This ensures that each variable has $\mu=0$ and $\sigma=1$.
This standardization should be applied to all features except the target (label). For the WPD.csv data set, the target is "OIL Prod. (e3m3/month)" and should not be changed.
The following Python code replaces 'Measured Depth (m)' with its z-score. The same can be done for all variables.
from scipy.stats import zscore
mean=df_im['Measured Depth (m)'].mean()
variance=df_im['Measured Depth (m)'].var()
print("Before standardization: Mean= ",int(mean), ", variance= ",int(variance))
df_im['Measured Depth (m)'] = zscore(df_im['Measured Depth (m)'])
mean=df_im['Measured Depth (m)'].mean()
variance=df_im['Measured Depth (m)'].var()
print("After standardization: Mean= ",int(mean), ", variance= ",int(variance))
df_im[0:5]
The input data for machine learning should be completely numeric, which means any text must be converted to numbers. In the data set WPD.csv, the columns 'Deviation (True/False)' and 'Prod. Formation' have categorical values. Use the value_counts() method to find out which categories exist and how many rows belong to each category.
df_c=df_im.copy()
df_c['Deviation (True/False)'].value_counts()
df_c['Prod. Formation'].value_counts()
We can replace each category with a number, starting from 0:
df_c['Deviation (True/False)']=df_c['Deviation (True/False)'].replace(False, 0) # Replace False with 0
df_c['Deviation (True/False)']=df_c['Deviation (True/False)'].replace(True, 1) # Replace True with 1
df_c[0:5]
df_c['Prod. Formation']=df_c['Prod. Formation'].replace('Shale', 0) # Replace Shale with 0
df_c['Prod. Formation']=df_c['Prod. Formation'].replace('Sand', 1) # Replace Sand with 1
df_c['Prod. Formation']=df_c['Prod. Formation'].replace('Shale-Sand', 2) # Replace Shale-Sand with 2
df_c[0:5]
One challenge with this representation is that machine learning algorithms consider two nearby values to be more similar than two distant values. This may be fine in some cases (ordered categories such as "bad", "average", "good"), but it is not always true. A common solution is to create one binary attribute per category. This is called one-hot encoding, because only one attribute is equal to 1 (hot) while the others are 0 (cold).
One_hot = pd.get_dummies(['a','b'],prefix='Deviation')
print(One_hot)
These dummies should be merged back into the data frame.
One_hot = pd.get_dummies(df_im['Deviation (True/False)'],prefix='Deviation')
print(One_hot[0:10]) # Just show the first 10
df_one_hot = pd.concat([df_im,One_hot],axis=1)
df_one_hot[0:5]
The same should be done for the 'Prod. Formation' column.
One_hot = pd.get_dummies(df_im['Prod. Formation'],prefix='Formation')
df_one_hot = pd.concat([df_one_hot,One_hot],axis=1)
df_one_hot[0:5]
Finally, the original columns 'Deviation (True/False)' and 'Prod. Formation' should be removed.
df_one_hot.drop(['Deviation (True/False)','Prod. Formation'], axis=1, inplace=True)
df_one_hot