Python
Libraries & Defaults
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
# to display all the columns of the dataframe in the notebook
pd.set_option('display.max_columns', None)
# Show a chart created using matplotlib directly under the code that produces it
%matplotlib inline
# Import pyplot from the matplotlib library, for creating chart
from matplotlib import pyplot as plt
# Configure the aesthetics of the charts
plt.rcParams['figure.figsize'] = (10, 8)
Import Data
df = pd.read_csv('my_file.csv')
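# Other read_csv options worth knowing (a sketch only; the 'id' and 'date' column names and the NA markers below are hypothetical):
df = pd.read_csv('my_file.csv',
                 sep=',',                 # field delimiter
                 index_col='id',          # column to use as the row index
                 parse_dates=['date'],    # parse this column as datetime
                 na_values=['?', 'NA'])   # extra strings to treat as NaN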
Initial Look at Data
len(df)
df.shape
df.columns
df.dtypes
df.head()
df.tail()
df['col'].unique()
df.sample()
df.describe(include='all')
df['col'].describe()
df.info()
# make a list of the variables that contain missing values
vars_with_na = [var for var in df.columns if df[var].isnull().sum() > 0]
# print the variable name and the percentage of missing values
for var in vars_with_na:
    print(var, np.round(df[var].isnull().mean() * 100, 3), '% missing values')
def analyse_na_value(df, var):
    df = df.copy()
    # make a variable that is 1 if the observation was missing and 0 otherwise
    df[var] = np.where(df[var].isnull(), 1, 0)
    # calculate the median SalePrice (our target feature) where the information is missing vs present
    df.groupby(var)['SalePrice'].median().plot.bar()
    plt.title(var)
    plt.show()
for var in vars_with_na:
    analyse_na_value(df, var)
# get a list of variables that are numerical
num_vars = [var for var in df.columns if df[var].dtypes != 'O']
print('Number of numerical variables: ', len(num_vars))
# view the first few rows of the numerical variables
df[num_vars].head()
# Categorical variables
cat_vars = [var for var in df.columns if df[var].dtypes == 'O']
print('Number of categorical variables: ', len(cat_vars))
# how many category values per categorical feature?
for var in cat_vars:
    print(var, len(df[var].unique()), ' categories')
# Find category values that make up less than 1% of the observations, i.e. rare labels.
# Labels that are under-represented in the dataset tend to cause over-fitting of machine learning models,
# which is why we later group them under a single 'Rare' label.
def analyse_rare_labels(df, var, rare_perc):
    df = df.copy()
    tmp = df.groupby(var)['SalePrice'].count() / len(df)
    return tmp[tmp < rare_perc]

for var in cat_vars:
    print(analyse_rare_labels(df, var, 0.01))
    print()
Clean and Transform Data
df2 = df1.copy()
df.T
df['col'].astype('category')
df.rename( columns = {'old_name1':'new_name1', 'old_name2':'new_name2'}, inplace=True )
df['col'].map( {1: 'Diabetic', 0:'Non-Diabetic'} )
pd.cut( df['col'],
bins=[0, 3, 5,...],
labels=['low', 'med', 'high', ...],
right=False,
include_lowest=True
)
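# For example, binning a hypothetical 'age' column into three bands (bin edges and labels are illustrative):
df['age_band'] = pd.cut(df['age'],
                        bins=[0, 30, 50, 120],
                        labels=['low', 'med', 'high'],
                        right=False,
                        include_lowest=True)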
df = df.drop(columns=['col1', 'col2'])
df = df.drop( [0, 1] ) <-- drop rows with index 0 and index 1
df.dropna() <-- drop rows with any cells containing NaN
df.fillna(' ')
df.sort_values(by='col', ascending=False)
df.sort_index(ascending=False) <-- useful for sorting a series.
df.loc[ (df['col1']=='value1') & (df['col2']<='value2'), ['col1', 'col2', 'col3']]
totals = df.loc[
'Ancoats':'Woodhouse',
'Searching for Work':'Preparing for work'
].sum(axis='rows') <-- create totals for a range of columns
df.iloc[[0,1], 9:15]
df['col'].value_counts(normalize=True) <-- normalize=True gives proportions rather than integer counts
.isnull()
.notnull()
.isin( ['value1', 'value2'] )
~ <-- reverses the boolean logic test of what comes after this (like 'NOT')
.between(20, 35)
.str.contains('blah')
.str.startswith('blah')
.str.endswith('blah')
.sum()
.count()
.mean()
.median()
.min()
.max()
.quantile( [0.25, 0.75] )
.std()
.var()
.abs()
.groupby('col').size()
.groupby('col').mean()
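# A sketch of how the filters and aggregations above combine in practice ('age', 'city' and 'income' are hypothetical columns):
df.loc[df['age'].between(20, 35) & df['city'].isin(['Leeds', 'York'])]
df.loc[~df['city'].str.contains('shire', na=False)]
df.groupby('city')['income'].agg(['mean', 'median', 'count'])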
pd.merge(df1, df2, how='outer')
df1.columns.intersection(df2.columns) <-- find columns in common (the '&' operator on Index objects is deprecated)
df1.merge(df2, on='col')
df1.merge(df2, left_on='col1', right_on='col2')
df1.merge(df2, left_on='col1', right_index=True)
df1\
.merge(df2, on='col')\
.merge(df3, on='col2')\
.merge(df4.query(' col3 == "value1" '), on='col4' )\
.sort_values('col5')\
.tail(10)\
[['colX', 'colY', 'colZ']]
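# The feature-engineering snippets below assume the data has already been split into X_train and X_test.
# A minimal sketch of one way to create that split with scikit-learn ('SalePrice' is the target as in the
# house-price examples above; test_size and random_state are illustrative):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['SalePrice'], test_size=0.2, random_state=0)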
# function to replace missing values with 'Missing' in categorical variables
def fill_categorical_na(df, var_list):
    X = df.copy()
    X[var_list] = df[var_list].fillna('Missing')
    return X
# replace missing values with new label: "Missing"
X_train = fill_categorical_na(X_train, vars_with_na)
X_test = fill_categorical_na(X_test, vars_with_na)
# check that we have no missing information in the engineered variables
X_train[vars_with_na].isnull().sum()
# check that test set does not contain null values in the engineered variables
[var for var in vars_with_na if X_test[var].isnull().sum() > 0]
# For numerical variables with missing values, add an additional variable capturing the missing information,
# and then replace the missing values in the original variable with the mode (the most frequent value):
# make a list of the numerical variables that contain missing values
vars_with_na = [var for var in X_train.columns if X_train[var].isnull().sum() > 0 and X_train[var].dtypes != 'O']
# print the variable name and the percentage of missing values
for var in vars_with_na:
    print(var, np.round(X_train[var].isnull().mean() * 100, 3), '% missing values')
# replace the missing values
for var in vars_with_na:
    # calculate the mode on the training set
    mode_val = X_train[var].mode()[0]
    # train: add a binary 'missing' indicator, then impute with the mode
    X_train[var + '_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_train[var] = X_train[var].fillna(mode_val)
    # test: reuse the mode learned from the training set
    X_test[var + '_na'] = np.where(X_test[var].isnull(), 1, 0)
    X_test[var] = X_test[var].fillna(mode_val)
# check that we have no more missing values in the engineered variables
X_train[vars_with_na].isnull().sum()
# log transform the numerical variables that do not contain zeros, in order to get a more Gaussian-like distribution.
# This tends to help linear machine learning models.
for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']:
    X_train[var] = np.log(X_train[var])
    X_test[var] = np.log(X_test[var])
# check that test set does not contain null values in the engineered variables
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_test[var].isnull().sum()>0]
# same for train set
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice'] if X_train[var].isnull().sum()>0]
# deal with categories in variables that are present in less than 1% of the observations:
# capture the categorical variables first
cat_vars = [var for var in X_train.columns if X_train[var].dtype == 'O']
def find_frequent_labels(df, var, rare_perc):
    # find the labels that are shared by more than a certain % of the houses in the dataset
    df = df.copy()
    tmp = df.groupby(var)['SalePrice'].count() / len(df)
    return tmp[tmp > rare_perc].index

# if the label is frequent, leave it alone; otherwise assign it to a new label called 'Rare'.
# Mapping unseen values to 'Rare' also handles any new, rare values that come along in production.
for var in cat_vars:
    frequent_ls = find_frequent_labels(X_train, var, 0.01)
    X_train[var] = np.where(X_train[var].isin(frequent_ls), X_train[var], 'Rare')
    X_test[var] = np.where(X_test[var].isin(frequent_ls), X_test[var], 'Rare')
Visualise Data
df['col'].value_counts().plot.barh()
df['col'].value_counts().plot.barh(color='#deaa02', alpha=0.8, width=0.9);
plt.xlabel('Number of UK Python Users');
plt.rcParams['xtick.labelsize']=14
plt.rcParams['ytick.labelsize']=12
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.grid'] = False
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['ytick.major.pad']= 4
plt.rcParams['xtick.major.pad']= 4
plt.rcParams['axes.titlepad'] = 10
plt.rcParams['axes.labelpad'] = 10
df.groupby( ['col1', 'col2'] )['col3']\
.mean()\
.unstack('col2')\
.loc[:, col2val1:col2val2]\
.plot.bar()
.plot.scatter(x='', y='')
df['col'].plot.box()
df['col'].plot.hist()
df['col'].plot.kde()
sns.histplot(df['col'], kde=True); <-- sns.distplot() is deprecated; use histplot (or displot)
sns.boxplot(df['col']);