Issue
I am trying to predict the 'Full_Time_Home_Goals' column (the target).
I have followed the Kaggle example. The code works even though the train and test sets have different sizes, as in my example (892 rows in the train data, 419 rows in the test data):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Set option to display all the rows and columns in the dataset. If there are more rows, adjust number accordingly.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Files
data_train = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\train.csv")
data_test = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\test.csv")
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]
data_train = data_train.dropna()
data_test = data_test.dropna()
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['HomeTeam', 'AwayTeam']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all would contain all columns required for prediction and y_all would have the one column we want to predict
X_all = data_train
y_all = data_train['Full_Time_Home_Goals']
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Using Random Forest with the parameters defined below
clf = RandomForestClassifier()
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
The errors I am getting are:
With the code as is:
Traceback (most recent call last):
  File "C:/Users/harsh/PycharmProjects/Kaggle-Machine Learning from Start to Finish with Scikit-Learn/EPL Predicting.py", line 98, in <module>
    predictions = clf.predict(data_test.drop('Id', axis=1))
  File "C:\Users\harsh\PycharmProjects\GitHub\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2
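(This ValueError is scikit-learn's generic shape check: predict() must receive exactly as many feature columns as the model was fitted on. A minimal sketch, with made-up data, that reproduces the same mismatch:)

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Fit on 4 feature columns, then predict on 2 -- triggers the same ValueError
clf = RandomForestClassifier(n_estimators=10)
clf.fit(np.random.rand(20, 4), np.random.randint(0, 2, 20))
try:
    clf.predict(np.random.rand(5, 2))
except ValueError as e:
    print(e)  # exact wording varies by scikit-learn version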
With the code changed from
predictions = clf.predict(data_test.drop('Id', axis=1))
to
predictions = clf.predict(X_test)
the error is:
raise ValueError(msg)
ValueError: array length 37921 does not match index length 380
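(This second error is pandas refusing to align the two lengths: clf.predict(X_test) returns one prediction per row of X_test, the 20% validation split carved out of the training data, here 37921 rows, which cannot fill a 380-row test-set index. A minimal sketch of the same failure with small, made-up sizes:)

import pandas as pd

try:
    pd.Series([0, 1, 2, 3, 4], index=[0, 1, 2])  # 5 values, 3-row index
except ValueError as e:
    print(e)  # exact wording varies by pandas version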
How do I resolve this issue?
The datasets I used can be found here.
Solution
Below is a tested and fully working version of your code. The key changes: X_all must not include the target column Full_Time_Home_Goals (your model was effectively trained on the answer, which is also why it expected 4 features), and the final predictions are made on data_test, not on X_test:
import pandas as pd

data_train = pd.read_csv(r"train.csv")
data_test = pd.read_csv(r"test.csv")
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]
data_train = data_train.dropna()
data_test = data_test.dropna()
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['HomeTeam', 'AwayTeam']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all contains the columns used for prediction; y_all is the one column we want to predict
y_all = data_train['Full_Time_Home_Goals']
X_all = data_train.drop(['Full_Time_Home_Goals'], axis=1)
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Using Random Forest with the parameters defined below
clf = RandomForestClassifier()
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))
ids = data_test['Id']
# data_test has exactly the three columns the model was fitted on (Id, HomeTeam, AwayTeam)
predictions = clf.predict(data_test)
df_preds = pd.DataFrame({"id": ids, "predictions": predictions})
df_preds
data_train.head():

   Id  HomeTeam  AwayTeam  Full_Time_Home_Goals
0   1        55       440                     3
1   2       158       493                     2
2   3       178       745                     1
3   4       185       410                     1
4   5       249        57                     2

data_test.head():

       Id  HomeTeam  AwayTeam
0  190748       284        54
1  190749       124       441
2  190750       446        57
3  190751       185       637
4  190752       749       482

Accuracy on the held-out 20% split: 0.33213786556261704

df_preds:

         id  predictions
0    190748            1
1    190749            1
2    190750            1
3    190751            1
4    190752            1
..      ...          ...
375  191123            1
376  191124            1
377  191125            1
378  191126            1
379  191127            1

380 rows × 2 columns
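If the goal is a Kaggle-style submission file, df_preds can be written straight to disk; a minimal sketch (the filename, and whatever column headers the competition expects, are assumptions):

# Write the predictions to CSV; "submission.csv" is an assumed name
df_preds.to_csv("submission.csv", index=False)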
Answered By - Sergey Bushmanov