Quick Machine Learning Workflow in Python, with KNN on the Ionosphere Data as an Example

There are many ways to build machine learning models in Python; this article serves as a simple summary of the essential steps of a machine learning workflow, from data loading to final visualization.

You can find the data here: http://archive.ics.uci.edu/ml/datasets/Ionosphere

More details can be found in Robert Layton’s book here:
https://www.goodreads.com/book/show/26019855-learning-data-mining-with-python?from_search=true

###1. Load data
from sklearn.model_selection import train_test_split
import numpy as np 
import csv
# Path of the comma-separated data file; the loop below parses every row as
# data, so the file must not start with a header row.
datafile = "ionosphere.data.csv"
# Pre-allocate the feature matrix and the boolean label vector. The sizes
# (351 rows, 34 feature columns) are hard-coded.
X = np.zeros((351, 34), dtype='float')
Y = np.zeros((351,), dtype='bool')

# newline='' is what the csv module asks for when it is handed a file object,
# so that line endings are interpreted by the csv reader itself.
with open(datafile, 'r', newline='') as input_file:
    reader = csv.reader(input_file)
    for i, row in enumerate(reader):
        # Every column except the last one is a numeric feature.
        data = [float(datum) for datum in row[:-1]]
        X[i] = data
        # 1 if the class is 'g', 0 otherwise
        Y[i] = row[-1] == 'g'
print(X.shape)

###2. Observe data
import pandas as pd 
def rstr(df):
    """Summarise a DataFrame as a ``(shape, per-column uniques)`` pair.

    The first element is ``df.shape``. The second is whatever ``df.apply``
    produces when every column is mapped to a one-element list holding that
    column's ``unique()`` values.
    """
    def wrapped_uniques(column):
        # Wrap the unique values in a list, one list per column.
        return [column.unique()]

    dimensions = df.shape
    uniques_per_column = df.apply(wrapped_uniques)
    return dimensions, uniques_per_column
# Print the shape of the feature matrix together with the unique values
# found in each of its columns.
structure_header = '\n' 'structure of data:' '\n'
print(structure_header, rstr(pd.DataFrame(X)))
#structure of data:
# ((351, 34), 0                                          [[1.0, 0.0]]
#1                                               [[0.0]]
#2     [[0.99539, 1.0, 0.02337, 0.97588, 0.0, 0.96355...
#3     [[-0.05889, -0.18829, -0.03365, -0.45161, -0.0...
#4     [[0.85243, 0.93035, 1.0, 0.9414, -0.09924, 0.9...
#5     [[0.02306, -0.36156, 0.00485, 1.0, 0.06531, -0...
#6     [[0.83398, -0.10868, 1.0, 0.71216, 0.92106, -0...
#7     [[-0.37708, -0.93597, -0.12062, -1.0, -0.23255...
#8     [[1.0, 0.88965, 0.0, 0.77152, 0.14706, 0.85996...
#9     [[0.0376, -0.04549, 0.01198, 0.0, -0.16399, 0....
#10    [[0.85243, 0.50874, 0.73082, 0.0, 0.52798, 0.0...
#11    [[-0.17755, -0.67743, 0.05346, 0.0, -0.20275, ...
#12    [[0.59755, 0.34432, 0.85443, 0.0, 0.56409, 0.7...
#13    [[-0.44945, -0.69707, 0.00827, 0.0, -0.00712, ...
#14    [[0.60536, -0.51685, 0.54591, -1.0, 0.34395, -...
#15    [[-0.38223, -0.97515, 0.00299, 0.14516, -0.274...
#16    [[0.84356, 0.05499, 0.83775, 0.54094, 0.5294, ...
#17    [[-0.38542, -0.62237, -0.13644, -0.3933, -0.21...
#18    [[0.58212, 0.33109, 0.75535, -1.0, 0.45107, -0...
#19    [[-0.32192, -1.0, -0.0854, -0.54467, -0.17813,...
#20    [[0.56971, -0.13151, 0.70887, -0.69975, 0.0598...
#21    [[-0.29674, -0.453, -0.27502, 1.0, -0.35575, 0...
#22    [[0.36946, -0.18056, 0.43385, 0.0, 0.02309, 0....
#23    [[-0.47357, -0.35734, -0.12062, 0.0, -0.52879,...
#24    [[0.56811, -0.20332, 0.57528, 1.0, 0.03286, 0....
#25    [[-0.51171, -0.26569, -0.4022, 0.90695, -0.651...
#26    [[0.41078, -0.20468, 0.58984, 0.51613, 0.1329,...
#27    [[-0.46168, -0.18401, -0.22145, 1.0, -0.53206,...
#28    [[0.21266, -0.1904, 0.431, 1.0, 0.02431, -0.07...
#29    [[-0.3409, -0.11593, -0.17365, -0.20099, -0.62...
#30    [[0.42267, -0.16626, 0.60436, 0.25682, -0.0570...
#31    [[-0.54487, -0.06288, -0.2418, 1.0, -0.59573, ...
#32    [[0.18641, -0.13738, 0.56045, -0.32382, -0.046...
#33    [[-0.453, -0.02447, -0.38238, 1.0, -0.65697, 0...

###3. Preprocess data
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with a MinMaxScaler left at its default settings.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so the test rows influence the scaling.
feature_scaler = MinMaxScaler()
X_transformed = feature_scaler.fit_transform(X)

###4. Split the data
# Hold out part of the scaled data for testing; random_state is pinned so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, Y, random_state=14)
# Report the split sizes with one formatting style (str.format) throughout.
print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))
#There are 263 samples in the training dataset
#There are 88 samples in the testing dataset
#Each sample has 34 features

###5. Simply Fit the model
from sklearn.neighbors import KNeighborsClassifier
# The classifier is created without arguments, i.e. with the library defaults.
# For reference, the explicit configuration this article takes those defaults
# to be (verify against your scikit-learn version):
#   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#              metric_params=None, n_jobs=1, n_neighbors=5, p=2,
#              weights='uniform')
KNN_io = KNeighborsClassifier()
# Fit on the training split only; the test split is used later for scoring.
KNN_io.fit(X_train, y_train)

###6. Validate if you want to
from sklearn.model_selection import cross_val_score
# Cross-validate the KNN estimator on the complete, already scaled data set
# and report the mean accuracy as a percentage.
# NOTE(review): no cv argument is passed, so the number of folds is whatever
# the installed scikit-learn uses by default - the figure quoted below may
# not reproduce on a different release.
trans_scores = cross_val_score(
    KNN_io,
    X_transformed,
    Y,
    scoring='accuracy',
)
mean_fold_score = np.mean(trans_scores)
average_accuracy = mean_fold_score * 100
accuracy_message = "The average accuracy is {0:.1f}%"
print(accuracy_message.format(average_accuracy))
#The average accuracy is 82.3%

###7. Test the model
# Predict the held-out samples with the fitted model and report the share of
# predictions that match the true labels, as a percentage.
y_predicted = KNN_io.predict(X_test)
is_correct = (y_test == y_predicted)
accuracy = np.mean(is_correct) * 100
test_message = "The accuracy is {0:.1f}%"
print(test_message.format(accuracy))
#The accuracy is 86.4%

###8. Do it in pipeline if you want to
from sklearn.pipeline import Pipeline
# Bundle scaling and classification into a single estimator and cross-validate
# it on the raw (unscaled) feature matrix.
pipeline_steps = [
    ('scale', MinMaxScaler()),
    ('predict', KNeighborsClassifier()),
]
scaling_pipeline = Pipeline(pipeline_steps)
pipe_scores = cross_val_score(scaling_pipeline, X, Y, scoring='accuracy')
pipeline_accuracy = np.mean(pipe_scores) * 100
print("The average accuracy for pipeline is {0:.1f}%".format(pipeline_accuracy))
#The average accuracy for pipeline is 82.3%

###9. Gridsearch to tune parameters 
# Sweep n_neighbors over 1..20, cross-validating a fresh KNN on the scaled
# data for each value and keeping both the per-fold scores and their mean.
parameter_values = list(range(1, 21))  # Including 20
# Settings that stay the same for every candidate model.
fixed_knn_settings = dict(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
    p=2,
    weights='uniform',
)
avg_scores = []
all_scores = []
for n_neighbors in parameter_values:
    KNN_grid = KNeighborsClassifier(n_neighbors=n_neighbors, **fixed_knn_settings)
    grid_scores = cross_val_score(KNN_grid, X_transformed, Y, scoring='accuracy')
    all_scores.append(grid_scores)
    avg_scores.append(np.mean(grid_scores))
print("The gridsearch-based average accuracy is: ",avg_scores)
#The gridsearch-based average accuracy is:  [0.83475783475783472, 0.85754985754985746, 
#0.83760683760683763, 0.84330484330484323, 0.8233618233618234, 0.84045584045584043, 
#0.80911680911680917, 0.84045584045584043, 0.81766381766381768, 0.83190883190883191, 
#0.79487179487179482, 0.81196581196581208, 0.79487179487179482, 0.80626780626780625, 
#0.78917378917378922, 0.79202279202279202, 0.7834757834757835, 0.78917378917378922, 
#0.77777777777777779, 0.79487179487179482]

###10. Visualize
from matplotlib import pyplot as plt
# Plot the mean cross-validation accuracy against the number of neighbours
# on a large canvas, with thick lines and large markers.
canvas_size = (32, 20)
line_style = dict(linewidth=5, markersize=10)
plt.figure(figsize=canvas_size)
plt.plot(parameter_values, avg_scores, '-o', **line_style)
plt.show()

Screen Shot 2017-06-08 at 3.24.36 PM.png

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s