Issue
I am currently working on a model that reads structured data and determines if someone has a disease. I think the issue is the data is not being split between training and testing data. I am unaware of how I would be able to do that.
I am not sure what to try.
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
heart_data = pd.read_csv('cardio_train.csv')
heart_data.head()
heart_data.shape
heart_data.describe()
heart_data.isnull().sum()
heart_data_columns = heart_data.columns
predictors = heart_data[heart_data_columns[heart_data_columns != 'target']] # all columns except Breast Cancer
target = heart_data['target'] # Breast Cancer column
#This function returns the first n rows for the object based on position. It is useful for quickly testing if your object has the right type
predictors.head()
target.head()
#normalize the data by subtracting the mean and dividing by the standard deviation.
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()
n_cols = predictors_norm.shape[1] # number of predictors
def regression_model():
# create model
model = Sequential()
#inputs
model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
model.add(Dense(50, activation='relu')) # activation function
model.add(Dense(1))
# compile model
model.compile(optimizer='adam', loss='mean_squared_error')
#loss measures the results and figures out how bad it did. Optimizer generates next guess.
return model
# build the model
model = regression_model()
print (model)
# fit the model
history=model.fit(predictors_norm, target, validation_split=0.3, epochs=10, verbose=2)
#Decision Tree
print ("Processing Decision Tree")
dtc = DecisionTreeClassifier()
dtc.fit(predictors_norm,target)
print("Decision Tree Test Accuracy {:.2f}%".format(dtc.score(predictors_norm, target)*100))
#Support Vector Machine
print ("Processing Support Vector Machine")
svm = SVC(random_state = 1)
svm.fit(predictors_norm, target)
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(svm.score(predictors_norm,target)*100))
#Random Forest
print ("Processing Random Forest")
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
rf.fit(predictors_norm, target)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(predictors_norm,target)*100))
The message i am getting is this Decision Tree Test Accuracy 100.00% However, support vector machine is getting 73.37%
Solution
You are evaluating your model on the same data as when you trained it : you are probably overfitting. To overcome this, you must separate the data into two parts, one for learning, one for testing :
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2)
Then, learn your model with the train dataset and evaluate it on the test dataset :
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
accuracy = dtc.score(x_test, y_test) * 100
print(f"Decision Tree test accuracy : {accuracy} %.")
Answered By - Arnaud
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.