Python
Libraries & Defaults
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
# to display all the columns of the dataframe in the notebook
pd.set_option('display.max_columns', None)
# Show a chart created using matplotlib directly under the code that produces it
%matplotlib inline
# Import pyplot from the matplotlib library, for creating chart
from matplotlib import pyplot as plt
# Configure the aesthetics of the charts
plt.rcParams['figure.figsize'] = (10, 8)
Import Data
df = pd.read_csv('my_file.csv')
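# Other read_csv options worth knowing (a sketch only; the 'id' and 'date' column names and the NA markers below are hypothetical):
df = pd.read_csv('my_file.csv',
                 sep=',',                 # field delimiter
                 index_col='id',          # column to use as the row index
                 parse_dates=['date'],    # parse this column as datetime
                 na_values=['?', 'NA'])   # extra strings to treat as NaN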
Initial Look at Data
len(df)
df.shape
df.columns
df.dtypes
df.head()
df.tail()
df['col'].unique()
df.sample()
df.describe(include='all')
df['col'].describe()
df.info()
# make a list of the variables that contain missing values
vars_with_na = [var for var in df.columns if df[var].isnull().sum() > 0]
# print the variable name and the percentage of missing values
for var in vars_with_na:
    print(var, np.round(df[var].isnull().mean() * 100, 3), '% missing values')
def analyse_na_value(df, var):
    df = df.copy()
    # make a variable that is 1 if the observation was missing and 0 otherwise
    df[var] = np.where(df[var].isnull(), 1, 0)
    # calculate the median SalePrice (our target feature) where the information is missing vs present
    df.groupby(var)['SalePrice'].median().plot.bar()
    plt.title(var)
    plt.show()
for var in vars_with_na:
    analyse_na_value(df, var)
# get a list of variables that are numerical
num_vars = [var for var in df.columns if df[var].dtypes != 'O']
print('Number of numerical variables: ', len(num_vars))
# view the first few rows of the numerical variables
df[num_vars].head()
# Categorical variables
cat_vars = [var for var in df.columns if df[var].dtypes == 'O']
print('Number of categorical variables: ', len(cat_vars))
# how many category values per categorical feature?
for var in cat_vars:
    print(var, len(df[var].unique()), ' categories')
# Find category values that make up less than 1% of the observations, i.e. rare labels.
# Labels that are under-represented in the dataset tend to cause over-fitting of machine learning models,
# which is why we later group them under a single 'Rare' label.
def analyse_rare_labels(df, var, rare_perc):
    df = df.copy()
    tmp = df.groupby(var)['SalePrice'].count() / len(df)
    return tmp[tmp < rare_perc]

for var in cat_vars:
    print(analyse_rare_labels(df, var, 0.01))
    print()
Clean and Transform Data
df2 = df1.copy()
df.T
df['col'].astype('category')
df.rename( columns = {'old_name1':'new_name1', 'old_name2':'new_name2'}, inplace=True )
df['col'].map( {1: 'Diabetic', 0:'Non-Diabetic'} )
pd.cut( df['col'],
bins=[0, 3, 5,...],
labels=['low', 'med', 'high', ...],
right=False,
include_lowest=True
)
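# For example, binning a hypothetical 'age' column into three bands (bin edges and labels are illustrative):
df['age_band'] = pd.cut(df['age'],
                        bins=[0, 30, 50, 120],
                        labels=['low', 'med', 'high'],
                        right=False,
                        include_lowest=True)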
df = df.drop(columns=['col1', 'col2'])
df = df.drop( [0, 1] ) <-- drop rows with index 0 and index 1
df.dropna() <-- drop rows with any cells containing NaN
df.fillna(' ')
df.sort_values(by='col', ascending=False)
df.sort_index(ascending=False) <-- useful for sorting a series.
df.loc[ (df['col1']=='value1') & (df['col2']<='value2'), ['col1', 'col2', 'col3']]
totals = df.loc[
'Ancoats':'Woodhouse',
'Searching for Work':'Preparing for work'
].sum(axis='rows') <-- create totals for a range of columns
df.iloc[[0,1], 9:15]
df['col'].value_counts(normalize=True) <-- normalize=True gives proportions rather than integer counts
.isnull()
.notnull()
.isin( ['value1', 'value2'] )
~ <-- reverses the boolean logic test of what comes after this (like 'NOT')
.between(20, 35)
.str.contains('blah')
.str.startswith('blah')
.str.endswith('blah')
.sum()
.count()
.mean()
.median()
.min()
.max()
.quantile( [0.25, 0.75] )
.std()
.var()
.abs()
.groupby('col').size()
.groupby('col').mean()
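# A sketch of how the filters and aggregations above combine in practice ('age', 'city' and 'income' are hypothetical columns):
df.loc[df['age'].between(20, 35) & df['city'].isin(['Leeds', 'York'])]
df.loc[~df['city'].str.contains('shire', na=False)]
df.groupby('city')['income'].agg(['mean', 'median', 'count'])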
pd.merge(df1, df2, how='outer')
df1.columns.intersection(df2.columns) <-- find columns in common (the '&' operator on Index objects is deprecated)
df1.merge(df2, on='col')
df1.merge(df2, left_on='col1', right_on='col2')
df1.merge(df2, left_on='col1', right_index=True)
df1\
.merge(df2, on='col')\
.merge(df3, on='col2')\
.merge(df4.query(' col3 == "value1" '), on='col4' )\
.sort_values('col5')\
.tail(10)\
[['colX', 'colY', 'colZ']]
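# The feature-engineering snippets below assume the data has already been split into X_train and X_test.
# A minimal sketch of one way to create that split with scikit-learn ('SalePrice' is the target as in the
# house-price examples above; test_size and random_state are illustrative):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['SalePrice'], test_size=0.2, random_state=0)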
# function to replace missing values with 'Missing' in categorical variables
def fill_categorical_na(df, var_list):
    X = df.copy()
    X[var_list] = df[var_list].fillna('Missing')
    return X
# replace missing values with new label: "Missing"
X_train = fill_categorical_na(X_train, vars_with_na)
X_test = fill_categorical_na(X_test, vars_with_na)
# check that we have no missing information in the engineered variables
X_train[vars_with_na].isnull().sum()
# check that test set does not contain null values in the engineered variables
[var for var in vars_with_na if X_test[var].isnull().sum() > 0]
# For numerical variables with missing values, add an additional variable capturing the missing information,
# and then replace the missing values in the original variable with the mode (the most frequent value):
# make a list of the numerical variables that contain missing values
vars_with_na = [var for var in X_train.columns if X_train[var].isnull().sum() > 0 and X_train[var].dtypes != 'O']
# print the variable name and the percentage of missing values
for var in vars_with_na:
    print(var, np.round(X_train[var].isnull().mean() * 100, 3), '% missing values')
# replace the missing values
for var in vars_with_na:
    # calculate the mode on the training set
    mode_val = X_train[var].mode()[0]
    # train: add a binary 'missing' indicator, then impute with the mode
    X_train[var + '_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_train[var] = X_train[var].fillna(mode_val)
    # test: reuse the mode learned from the training set
    X_test[var + '_na'] = np.where(X_test[var].isnull(), 1, 0)
    X_test[var] = X_test[var].fillna(mode_val)
# check that we have no more missing values in the engineered variables
X_train[vars_with_na].isnull().sum()
# log transform the numerical variables that do not contain zeros, in order to get a more Gaussian-like distribution.
# This tends to help linear machine learning models.
for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']:
    X_train[var] = np.log(X_train[var])
    X_test[var] = np.log(X_test[var])
# check that test set does not contain null values in the engineered variables
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_test[var].isnull().sum()>0]
# same for train set
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_train[var].isnull().sum()>0]
# deal with categories in variables that are present in less than 1% of the observations:
# capture the categorical variables first
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'O']
def find_frequent_labels(df, var, rare_perc):
    # find the labels that are shared by more than a certain % of the houses in the dataset
    df = df.copy()
    tmp = df.groupby(var)['SalePrice'].count() / len(df)
    return tmp[tmp > rare_perc].index

# if the label is frequent, leave it alone; otherwise assign it to a new label called 'Rare'.
# Mapping unseen values to 'Rare' also handles any new, rare values that come along in production.
for var in cat_vars:
    frequent_ls = find_frequent_labels(X_train, var, 0.01)
    X_train[var] = np.where(X_train[var].isin(frequent_ls), X_train[var], 'Rare')
    X_test[var] = np.where(X_test[var].isin(frequent_ls), X_test[var], 'Rare')
Visualise Data
df['col'].value_counts().plot.barh()
df['col'].value_counts().plot.barh(color='#deaa02', alpha=0.8, width=0.9);
plt.xlabel('Number of UK Python Users');
plt.rcParams['xtick.labelsize']=14
plt.rcParams['ytick.labelsize']=12
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.grid'] = False
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['ytick.major.pad']= 4
plt.rcParams['xtick.major.pad']= 4
plt.rcParams['axes.titlepad'] = 10
plt.rcParams['axes.labelpad'] = 10
df.groupby( ['col1', 'col2'] )['col3']\
.mean()\
.unstack('col2')\
.loc[:, col2val1:col2val2]\
.plot.bar()
.plot.scatter(x='', y='')
df['col'].plot.box()
df['col'].plot.hist()
df['col'].plot.kde()
sns.histplot(df['col'], kde=True); <-- sns.distplot() is deprecated; use histplot (or displot)
sns.boxplot(df['col']);