Issue
I am trying to predict the 'Full_Time_Home_Goals' column (the target).
I have followed the Kaggle example. The code works even though the train and test sets have different sizes, as in my example (892 rows in the train data, 419 rows in the test data):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
# Set option to display all the rows and columns in the dataset. If there are more rows, adjust number accordingly.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Files
data_train = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\train.csv")
data_test = pd.read_csv(r"C:\Users\harsh\Documents\My Dream\Desktop\Machine Learning\Attempt 3\test.csv")
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]
data_train = data_train.dropna()
data_test = data_test.dropna()
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['HomeTeam', 'AwayTeam']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all would contain all columns required for prediction and y_all would have the one column we want to predict
X_all = data_train
y_all = data_train['Full_Time_Home_Goals']
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Using Random Forest with the parameters defined below
clf = RandomForestClassifier()
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
The errors I am getting are:
With the code as is:
Traceback (most recent call last):
  File "C:/Users/harsh/PycharmProjects/Kaggle-Machine Learning from Start to Finish with Scikit-Learn/EPL Predicting.py", line 98, in <module>
    predictions = clf.predict(data_test.drop('Id', axis=1))
  File "C:\Users\harsh\PycharmProjects\GitHub\venv\lib\site-packages\sklearn\ensemble\_forest.py", line 629, in predict
ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2
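(This ValueError is scikit-learn's generic shape check: predict() must receive exactly as many feature columns as the model was fitted on. A minimal sketch, with made-up data, that reproduces the same mismatch:)

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Fit on 4 feature columns, then predict on 2 -- triggers the same ValueError
clf = RandomForestClassifier(n_estimators=10)
clf.fit(np.random.rand(20, 4), np.random.randint(0, 2, 20))
try:
    clf.predict(np.random.rand(5, 2))
except ValueError as e:
    print(e)  # exact wording varies by scikit-learn version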
With the code changed from
predictions = clf.predict(data_test.drop('Id', axis=1))
to
predictions = clf.predict(X_test)
the error is:
raise ValueError(msg)
ValueError: array length 37921 does not match index length 380
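(This second error is pandas refusing to align the two lengths: clf.predict(X_test) returns one prediction per row of X_test, the 20% validation split carved out of the training data, here 37921 rows, which cannot fill a 380-row test-set index. A minimal sketch of the same failure with small, made-up sizes:)

import pandas as pd

try:
    pd.Series([0, 1, 2, 3, 4], index=[0, 1, 2])  # 5 values, 3-row index
except ValueError as e:
    print(e)  # exact wording varies by pandas version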
How do I resolve this issue?
The datasets I used can be found here.
Solution
Below is a tested and fully working version of your code. The key changes: X_all must not include the target column Full_Time_Home_Goals (your model was effectively trained on the answer, which is also why it expected 4 features), and the final predictions are made on data_test, not on X_test:
import pandas as pd

data_train = pd.read_csv(r"train.csv")
data_test = pd.read_csv(r"test.csv")
columns = ['Id', 'HomeTeam', 'AwayTeam', 'Full_Time_Home_Goals']
col = ['Id', 'HomeTeam', 'AwayTeam']
data_test = data_test[col]
data_train = data_train[columns]
data_train = data_train.dropna()
data_test = data_test.dropna()
data_train['Full_Time_Home_Goals'] = data_train['Full_Time_Home_Goals'].astype(int)
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['HomeTeam', 'AwayTeam']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
data_train, data_test = encode_features(data_train, data_test)
print(data_train.head())
print(data_test.head())
# X_all contains the columns used for prediction; y_all is the one column we want to predict
y_all = data_train['Full_Time_Home_Goals']
X_all = data_train.drop(['Full_Time_Home_Goals'], axis=1)
from sklearn.model_selection import train_test_split
num_test = 0.20 # 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
# Using Random Forest with the parameters defined below
clf = RandomForestClassifier()
parameters = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt', 'auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 5, 8]
              }
acc_scorer = make_scorer(accuracy_score)
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))
ids = data_test['Id']
# data_test has exactly the three columns the model was fitted on (Id, HomeTeam, AwayTeam)
predictions = clf.predict(data_test)
df_preds = pd.DataFrame({"id": ids, "predictions": predictions})
df_preds
data_train.head():

   Id  HomeTeam  AwayTeam  Full_Time_Home_Goals
0   1        55       440                     3
1   2       158       493                     2
2   3       178       745                     1
3   4       185       410                     1
4   5       249        57                     2

data_test.head():

       Id  HomeTeam  AwayTeam
0  190748       284        54
1  190749       124       441
2  190750       446        57
3  190751       185       637
4  190752       749       482

Accuracy on the held-out 20% split: 0.33213786556261704

df_preds:

         id  predictions
0    190748            1
1    190749            1
2    190750            1
3    190751            1
4    190752            1
..      ...          ...
375  191123            1
376  191124            1
377  191125            1
378  191126            1
379  191127            1

380 rows × 2 columns
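If the goal is a Kaggle-style submission file, df_preds can be written straight to disk; a minimal sketch (the filename, and whatever column headers the competition expects, are assumptions):

# Write the predictions to CSV; "submission.csv" is an assumed name
df_preds.to_csv("submission.csv", index=False)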
Answered By - Sergey Bushmanov