Photo by Martim Braz on Unsplash

A kind of “Hello, World!” in ML (using a basic workflow)

1) Define objectives

2) Collect data

# code ti retrieve file from GDrive
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
file_list = drive.ListFile({'q': "'<folder id>' in parents and trashed=false"}).GetList()
for file1 in file_list:
print('title: %s, id: %s' % (file1['title'], file1['id']))

# create local file
house_prices_train_downloaded = drive.CreateFile({'id': '<file id>'})
house_prices_train_downloaded.GetContentFile('house_prices_train.csv')
house_prices_test_downloaded = drive.CreateFile({'id': '<file id>'})
house_prices_test_downloaded.GetContentFile('house_prices_test.csv')
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",100)
# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None
# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)
# Matplotlib visualization
import matplotlib.pyplot as plt
%matplotlib inline
# Set default font size
plt.rcParams['font.size'] = 24
# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize
# Seaborn for visualization
import seaborn as sns
sns.set(font_scale = 2)
from IPython.display import display
original_train_set = pd.read_csv('house_prices_train.csv')
display(original_train_set.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id 1460 non-null int64
MSSubClass 1460 non-null int64
MSZoning 1460 non-null object
LotFrontage 1201 non-null float64
LotArea 1460 non-null int64
Street 1460 non-null object
Alley 91 non-null object
LotShape 1460 non-null object
LandContour 1460 non-null object
Utilities 1460 non-null object
LotConfig 1460 non-null object
LandSlope 1460 non-null object
Neighborhood 1460 non-null object
Condition1 1460 non-null object
Condition2 1460 non-null object
BldgType 1460 non-null object
HouseStyle 1460 non-null object
OverallQual 1460 non-null int64
OverallCond 1460 non-null int64
YearBuilt 1460 non-null int64
YearRemodAdd 1460 non-null int64
RoofStyle 1460 non-null object
RoofMatl 1460 non-null object
Exterior1st 1460 non-null object
Exterior2nd 1460 non-null object
MasVnrType 1452 non-null object
MasVnrArea 1452 non-null float64
ExterQual 1460 non-null object
ExterCond 1460 non-null object
Foundation 1460 non-null object
BsmtQual 1423 non-null object
BsmtCond 1423 non-null object
BsmtExposure 1422 non-null object
BsmtFinType1 1423 non-null object
BsmtFinSF1 1460 non-null int64
BsmtFinType2 1422 non-null object
BsmtFinSF2 1460 non-null int64
BsmtUnfSF 1460 non-null int64
TotalBsmtSF 1460 non-null int64
Heating 1460 non-null object
HeatingQC 1460 non-null object
CentralAir 1460 non-null object
Electrical 1459 non-null object
1stFlrSF 1460 non-null int64
2ndFlrSF 1460 non-null int64
LowQualFinSF 1460 non-null int64
GrLivArea 1460 non-null int64
BsmtFullBath 1460 non-null int64
BsmtHalfBath 1460 non-null int64
FullBath 1460 non-null int64
HalfBath 1460 non-null int64
BedroomAbvGr 1460 non-null int64
KitchenAbvGr 1460 non-null int64
KitchenQual 1460 non-null object
TotRmsAbvGrd 1460 non-null int64
Functional 1460 non-null object
Fireplaces 1460 non-null int64
FireplaceQu 770 non-null object
GarageType 1379 non-null object
GarageYrBlt 1379 non-null float64
GarageFinish 1379 non-null object
GarageCars 1460 non-null int64
GarageArea 1460 non-null int64
GarageQual 1379 non-null object
GarageCond 1379 non-null object
PavedDrive 1460 non-null object
WoodDeckSF 1460 non-null int64
OpenPorchSF 1460 non-null int64
EnclosedPorch 1460 non-null int64
3SsnPorch 1460 non-null int64
ScreenPorch 1460 non-null int64
PoolArea 1460 non-null int64
PoolQC 7 non-null object
Fence 281 non-null object
MiscFeature 54 non-null object
MiscVal 1460 non-null int64
MoSold 1460 non-null int64
YrSold 1460 non-null int64
SaleType 1460 non-null object
SaleCondition 1460 non-null object
SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
!pip install speedml
from speedml import Speedml
sml = Speedml('house_prices_train.csv',
'house_prices_test.csv',
target = 'SalePrice',
uid = 'Id')
Collecting speedml
Downloading https://files.pythonhosted.org/packages/b1/72/91dcc93415b09829897b3d34a87383a946b720771b6d1662fbc017782b6c/speedml-0.9.3-py2.py3-none-any.whl
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from speedml) (0.16.0)
Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages (from speedml) (0.7.1)
Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from speedml) (0.22.0)
Collecting sklearn (from speedml)
Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from speedml) (1.14.3)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from speedml) (2.1.2)
Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages (from speedml) (0.7.post4)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas->speedml) (2018.4)
Requirement already satisfied: python-dateutil>=2 in /usr/local/lib/python3.6/dist-packages (from pandas->speedml) (2.5.3)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sklearn->speedml) (0.19.1)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->speedml) (1.11.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->speedml) (2.2.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->speedml) (0.10.0)
Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost->speedml) (0.19.1)
Building wheels for collected packages: sklearn
Running setup.py bdist_wheel for sklearn ... done
Stored in directory: /content/.cache/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn, speedml
Successfully installed sklearn-0.0 speedml-0.9.3
2) Understand and prepare the data

3) Understand and prepare the data

3.1 Data preparation

def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df``, restricted to the
        columns that contain at least one null value. It has two columns:
        'Missing Values' (number of nulls) and '% ' (nulls as a percentage
        of ``len(df)``). Rows are sorted by '% ' in descending order and the
        values are rounded to one decimal.
    """
    # Count the nulls once and derive the percentage from the same counts
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # The two unnamed Series become columns 0 and 1, renamed right after
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% '})

    # Keep only the columns with missing data and sort the table by
    # percentage of missing values, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% ', ascending=False).round(1)

    return mis_val_table_ren_columns
missing_values_table(sml.train)
sml.feature.drop(['PoolQC','MiscFeature','Alley','Fence'])
'Dropped 4 features with 76 features available.'
sml.feature.impute()
missing_values_table(sml.train)
display(sml.train.info())
'Imputed 1558 empty values to 0.'
Your selected dataframe has 76 columns.
There are 0 columns that have missing values.
Missing Values % of Total Values
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 76 columns):
MSSubClass 1460 non-null int64
MSZoning 1460 non-null object
LotFrontage 1460 non-null float64
LotArea 1460 non-null int64
Street 1460 non-null object
LotShape 1460 non-null object
LandContour 1460 non-null object
Utilities 1460 non-null object
LotConfig 1460 non-null object
LandSlope 1460 non-null object
Neighborhood 1460 non-null object
Condition1 1460 non-null object
Condition2 1460 non-null object
BldgType 1460 non-null object
HouseStyle 1460 non-null object
OverallQual 1460 non-null int64
OverallCond 1460 non-null int64
YearBuilt 1460 non-null int64
YearRemodAdd 1460 non-null int64
RoofStyle 1460 non-null object
RoofMatl 1460 non-null object
Exterior1st 1460 non-null object
Exterior2nd 1460 non-null object
MasVnrType 1460 non-null object
MasVnrArea 1460 non-null float64
ExterQual 1460 non-null object
ExterCond 1460 non-null object
Foundation 1460 non-null object
BsmtQual 1460 non-null object
BsmtCond 1460 non-null object
BsmtExposure 1460 non-null object
BsmtFinType1 1460 non-null object
BsmtFinSF1 1460 non-null float64
BsmtFinType2 1460 non-null object
BsmtFinSF2 1460 non-null float64
BsmtUnfSF 1460 non-null float64
TotalBsmtSF 1460 non-null float64
Heating 1460 non-null object
HeatingQC 1460 non-null object
CentralAir 1460 non-null object
Electrical 1460 non-null object
1stFlrSF 1460 non-null int64
2ndFlrSF 1460 non-null int64
LowQualFinSF 1460 non-null int64
GrLivArea 1460 non-null int64
BsmtFullBath 1460 non-null float64
BsmtHalfBath 1460 non-null float64
FullBath 1460 non-null int64
HalfBath 1460 non-null int64
BedroomAbvGr 1460 non-null int64
KitchenAbvGr 1460 non-null int64
KitchenQual 1460 non-null object
TotRmsAbvGrd 1460 non-null int64
Functional 1460 non-null object
Fireplaces 1460 non-null int64
FireplaceQu 1460 non-null object
GarageType 1460 non-null object
GarageYrBlt 1460 non-null float64
GarageFinish 1460 non-null object
GarageCars 1460 non-null float64
GarageArea 1460 non-null float64
GarageQual 1460 non-null object
GarageCond 1460 non-null object
PavedDrive 1460 non-null object
WoodDeckSF 1460 non-null int64
OpenPorchSF 1460 non-null int64
EnclosedPorch 1460 non-null int64
3SsnPorch 1460 non-null int64
ScreenPorch 1460 non-null int64
PoolArea 1460 non-null int64
MiscVal 1460 non-null int64
MoSold 1460 non-null int64
YrSold 1460 non-null int64
SaleType 1460 non-null object
SaleCondition 1460 non-null object
SalePrice 1460 non-null int64
dtypes: float64(11), int64(26), object(39)
memory usage: 878.3+ KB

3.2 EDA

# Correlation of every numeric column with SalePrice, sorted ascending.
# The last entry of the correlation Series is dropped before sorting.
price_corr = sml.train.corr()['SalePrice']
price_corr[:-1].sort_values()
KitchenAbvGr -0.135907
EnclosedPorch -0.128578
MSSubClass -0.084284
OverallCond -0.077856
YrSold -0.028923
LowQualFinSF -0.025606
MiscVal -0.021190
BsmtHalfBath -0.016844
BsmtFinSF2 -0.011378
3SsnPorch 0.044584
MoSold 0.046432
PoolArea 0.092404
ScreenPorch 0.111447
BedroomAbvGr 0.168213
BsmtUnfSF 0.214479
BsmtFullBath 0.227122
LotArea 0.263843
HalfBath 0.284108
OpenPorchSF 0.315856
2ndFlrSF 0.319334
WoodDeckSF 0.324413
LotFrontage 0.334544
BsmtFinSF1 0.386420
Fireplaces 0.466929
GarageYrBlt 0.469056
MasVnrArea 0.472614
YearRemodAdd 0.507101
YearBuilt 0.522897
TotRmsAbvGrd 0.533723
FullBath 0.560664
1stFlrSF 0.605852
TotalBsmtSF 0.613581
GarageArea 0.623431
GarageCars 0.640409
GrLivArea 0.708624
OverallQual 0.790982
Name: SalePrice, dtype: float64

3.3 Feature selection

# The ten features with the highest correlation to SalePrice
columns_of_interest = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
]
# Summary statistics of the selected features
sml.train[columns_of_interest].describe()
def remove_outliers(df, columns):
    """Drop the rows holding extreme values in any of the given columns.

    The columns are processed in order. For each one, only the rows whose
    value lies strictly between ``Q1 - 3 * IQR`` and ``Q3 + 3 * IQR`` are
    kept, where the quartiles are computed on the rows that survived the
    previous columns.

    Args:
        df: pandas DataFrame to filter.
        columns: iterable of names of numeric columns of ``df``.

    Returns:
        The filtered DataFrame. The DataFrame passed in is not modified.
    """
    for c in columns:
        print('Removing outliers from ', c)
        # Read the two quartiles directly instead of building the whole
        # describe() summary twice
        first_quartile = df[c].quantile(0.25)
        third_quartile = df[c].quantile(0.75)
        # Interquartile range
        iqr = third_quartile - first_quartile
        # Remove outliers. The comparisons are strict, so when iqr is 0
        # the rows equal to the quartile value are dropped as well.
        df = df[(df[c] > (first_quartile - 3 * iqr)) &
                (df[c] < (third_quartile + 3 * iqr))]
    return df
# Filter the training set, then inspect the selected features and the
# resulting number of rows/columns.
sml.train = remove_outliers(sml.train, columns_of_interest)
sml.train.loc[:,columns_of_interest].describe()
sml.train.shape
Removing outliers from OverallQual
Removing outliers from GrLivArea
Removing outliers from GarageCars
Removing outliers from GarageArea
Removing outliers from TotalBsmtSF
Removing outliers from 1stFlrSF
Removing outliers from FullBath
Removing outliers from TotRmsAbvGrd
Removing outliers from YearBuilt
Removing outliers from YearRemodAdd
# Distribution of SalePrice in the training data as originally loaded from CSV
_ = sns.distplot(original_train_set['SalePrice'])
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements.

    Args:
        data: one-dimensional sequence or array of measurements.

    Returns:
        A tuple ``(x, y)`` where ``x`` is ``data`` sorted in ascending order
        and ``y`` holds the matching cumulative fractions
        ``1/n, 2/n, ..., 1`` with ``n = len(data)``.
    """
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y
    y = np.arange(1, n + 1) / n
    return x, y
# Relationship between the target and the most correlated features
_ = sns.jointplot(x="GrLivArea", y="SalePrice", data=sml.train)
sml.plot.bar("OverallQual", "SalePrice")
sml.plot.bar("GarageCars", "SalePrice");
plt.show()
# Select the object columns
object_columns = sml.train.select_dtypes('object').columns
# One-hot encode the object columns
sml.train = pd.get_dummies(sml.train, columns = object_columns)
sml.train.shape  # (1449, 275)

3.4 Feature engineering

4) Prepare the model

from sklearn.model_selection import train_test_split

# Target as a one-column DataFrame, predictors as everything else
targets = pd.DataFrame(sml.train['SalePrice'])
features = sml.train.drop(columns='SalePrice')

# Infinite values (positive or negative) become NaN
features = features.replace([np.inf, -np.inf], np.nan)

# Hold out 30% of the rows for testing; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.3,
    random_state=42,
)

4.1 Define a baseline

# Function to calculate mean absolute error
def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Both arguments are flattened to one dimension first, so a one-column
    DataFrame, a column vector of shape (n, 1), a flat array or a plain list
    are all compared element by element. A scalar ``y_pred`` is compared
    against every target.

    Args:
        y_true: true values (scalar, sequence, array, Series or DataFrame).
        y_pred: predicted values, or a single constant prediction.

    Returns:
        The mean of ``|y_true - y_pred|`` as a Python float.

    Raises:
        ValueError: if the flattened inputs have lengths that cannot be
            broadcast together.
    """
    actual = np.ravel(np.asarray(y_true, dtype=float))
    predicted = np.ravel(np.asarray(y_pred, dtype=float))
    # Return a plain float so the result can always be formatted with '%f'
    return float(np.mean(np.abs(actual - predicted)))

# The baseline is a constant prediction: the median SalePrice of the
# TRAINING targets. The original cell took the median of y_test, which
# leaks information from the data the baseline is evaluated on; the figures
# printed below were produced with that test-set median and will differ
# slightly once this cell is re-run.
baseline_guess = np.median(y_train)
print('The baseline guess is %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
The baseline guess is 163250.00
Baseline Performance on the test set: MAE = 51501.8644

4.2 Train the simplest model

from sklearn.linear_model import LinearRegression

lr = LinearRegression()

# Train the model
lr.fit(X_train, y_train)

# Make predictions and evaluate
# (the original called `model.predict`, but the fitted estimator is `lr`)
lr_pred = lr.predict(X_test)
lr_mae = mae(y_test, lr_pred)
print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)
Linear Regression Performance on the test set: MAE = 17273.8701

4.3 Evaluate the model

# Real vs predicted SalePrice for each test sample.
# The predictions are stored in `lr_pred` (the name `model_pred` used by the
# original cell is never defined); they are flattened to one dimension
# before plotting.
_ = plt.plot(list(y_test.iloc[:,0]), marker='o', linestyle='none',
             alpha=0.2, label='real values')
_ = plt.plot(np.ravel(lr_pred), marker='.', linestyle='none', label = 'predicted')
_ = plt.xlabel('number of samples')
_ = plt.ylabel('SalePrice')
plt.show()
# Distribution of the predictions (red) against the real values
ax = sns.distplot(np.ravel(lr_pred), color='red', kde=True)
ax = sns.distplot(list(y_test.iloc[:,0]), kde=True)
ax.set(xlabel='SalePrice', ylabel='probability')

Tech consultant (antonellocalamea.com) | Avid learner | Composer | Proudly believing less is more, except for love and knowledge

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store