Issue
I am trying to fit an ML model and I am getting the following error:
TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.
My code:
import pandas as pd, numpy as np
import csv
import warnings
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from sklearn.impute import SimpleImputer
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
## Reading the data
# Tab-separated train/test files of the drug-review dataset, read directly from GitHub.
train_url = 'https://github.com/Rakesh9100/ML-Project-Drug-Review-Dataset/raw/main/datasets/drugsComTrain_raw.tsv'
test_url = 'https://github.com/Rakesh9100/ML-Project-Drug-Review-Dataset/raw/main/datasets/drugsComTest_raw.tsv'
# Column dtypes requested from read_csv; 'date' is additionally listed in parse_dates below.
dtypes = { 'Unnamed: 0': 'int32', 'drugName': 'category', 'condition': 'category', 'review': 'category', 'rating': 'float16', 'date': 'string', 'usefulCount': 'int16' }
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC.
train_df = pd.read_csv(train_url, sep='\t', quoting=2, dtype=dtypes, parse_dates=['date'])
# Keep a random 80% of the training rows; random_state makes the sample reproducible.
train_df = train_df.sample(frac=0.8, random_state=42)
test_df = pd.read_csv(test_url, sep='\t', quoting=2, dtype=dtypes, parse_dates=['date'])
## Extracting day, month, and year into separate columns
for df in [train_df, test_df]:
    # The new columns are added in place on each frame, stored as small
    # integer dtypes (int8 for day/month, int16 for year).
    df['day'] = df['date'].dt.day.astype('int8')
    df['month'] = df['date'].dt.month.astype('int8')
    df['year'] = df['date'].dt.year.astype('int16')
## Suppressing MarkupResemblesLocatorWarning, FutureWarning and ConvergenceWarning
# These filters are installed globally, so they affect every matching warning
# raised after this point, not only those from the calls below.
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
## Defining function to decode HTML-encoded characters
def decode_html(text):
    """Return the plain-text content of *text*.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` is returned, which decodes HTML-encoded
    characters in the review text.
    """
    return BeautifulSoup(text, 'html.parser').get_text()
## Applying the function to the review column
train_df['review'], test_df['review'] = train_df['review'].apply(decode_html), test_df['review'].apply(decode_html)
## Dropped the original date column and removed the useless column
# df.columns[0] is the first column of each frame — presumably the 'Unnamed: 0'
# column named in the dtypes mapping; confirm against the loaded data.
train_df, test_df = [df.drop('date', axis=1).drop(df.columns[0], axis=1) for df in (train_df, test_df)]
## Handling the missing values
# A separate most_frequent imputer is fitted on each frame. The DataFrame is
# rebuilt from the imputer output without column names, so it gets default
# integer column labels and a fresh 0..n-1 row index.
train_imp, test_imp = [pd.DataFrame(SimpleImputer(strategy='most_frequent').fit_transform(df)) for df in (train_df, test_df)]
## Assigning old column names
# The names are restored by hand and must match the column order of train_df/test_df.
train_imp.columns = ['drugName', 'condition', 'review', 'rating', 'usefulCount', 'day', 'month', 'year']
test_imp.columns = ['drugName', 'condition', 'review', 'rating', 'usefulCount', 'day', 'month', 'year']
## Converting the text in the review column to numerical data
# The vocabulary (at most 3000 terms) is learned from the training reviews only
# and then reused to transform the test reviews.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
train_reviews = vectorizer.fit_transform(train_imp['review'])
test_reviews = vectorizer.transform(test_imp['review'])
## Replacing the review column with the numerical data
train_imp.drop('review', axis=1, inplace=True)
test_imp.drop('review', axis=1, inplace=True)
# NOTE: a DataFrame built from the dense TF-IDF array has integer column
# labels, so after this concat the frame mixes string names ('drugName', ...)
# with integer names. scikit-learn rejects such mixed column-name types
# when the model is fitted.
train_imp = pd.concat([train_imp, pd.DataFrame(train_reviews.toarray())], axis=1)
test_imp = pd.concat([test_imp, pd.DataFrame(test_reviews.toarray())], axis=1)
## Encoding the categorical columns
for i in ["drugName", "condition"]:
    # NOTE(review): a new LabelEncoder is fitted separately on the train and
    # test frames, so the same category may receive different integer codes
    # in the two sets — confirm this is intended.
    train_imp[i] = LabelEncoder().fit_transform(train_imp[i])
    test_imp[i] = LabelEncoder().fit_transform(test_imp[i])
## Converting the data types of columns to reduce the memory usage
# Every column is cast exactly once, straight to its final dtype. Casting the
# whole frame to float16 first and to integers afterwards would silently
# corrupt the integer columns: float16 represents integers exactly only up to
# 2048, and label-encoded values can exceed that.
int16_cols = ['drugName', 'condition', 'usefulCount', 'year']
int8_cols = ['day', 'month']
# Everything not listed above (rating and the TF-IDF features) becomes float16.
target_dtypes = {col: 'float16' for col in train_imp.columns}
target_dtypes.update({col: 'int16' for col in int16_cols})
target_dtypes.update({col: 'int8' for col in int8_cols})
train_imp, test_imp = train_imp.astype(target_dtypes), test_imp.astype(target_dtypes)
#print(train_imp.iloc[:,:15].dtypes)
#print(test_imp.iloc[:,:15].dtypes)
## Splitting the train and test datasets into feature variables
# 'rating' is the target; every other column is used as a feature.
X_train, Y_train = train_imp.drop('rating', axis=1), train_imp['rating']
X_test, Y_test = test_imp.drop('rating', axis=1), test_imp['rating']
##### LinearRegression regression algorithm #####
linear=LinearRegression()
# The TypeError about mixed feature-name types is raised by this fit call,
# because X_train has both string and integer column names.
linear.fit(X_train, Y_train)
line_train=linear.predict(X_train)
line_test=linear.predict(X_test)
The error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-23-e787292883ec> in <cell line: 89>()
87
88 linear=LinearRegression()
---> 89 linear.fit(X_train, Y_train)
90 line_train=linear.predict(X_train)
91 line_test=linear.predict(X_test)
3 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py in _get_feature_names(X)
1901 # mixed type of string and non-string is not supported
1902 if len(types) > 1 and "str" in types:
-> 1903 raise TypeError(
1904 "Feature names are only supported if all input features have string names, "
1905 f"but your input has {types} as feature name / column name types. "
TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.
Solution
Just add a prefix to the columns of your TfidfVectorizer output,
so that their integer column names become strings:
# add_prefix('review') renames the integer column labels 0, 1, 2, ... to the
# strings 'review0', 'review1', 'review2', ..., so all column names are strings.
train_imp = pd.concat([train_imp, pd.DataFrame(train_reviews.toarray()).add_prefix('review')], axis=1)
test_imp = pd.concat([test_imp, pd.DataFrame(test_reviews.toarray()).add_prefix('review')], axis=1)
Output:
>>> train_imp
drugName condition rating usefulCount day month year review0 ... review2992 review2993 review2994 review2995 review2996 review2997 review2998 review2999
0 2276 816 10.0 24 26 11 2016 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 852 252 10.0 31 25 7 2009 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1633 297 8.0 31 21 12 2011 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3224 443 7.0 17 20 1 2013 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 915 87 10.0 20 11 4 2015 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
129033 2668 157 9.0 4 4 7 2016 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
129034 1665 157 1.0 9 6 1 2017 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
129035 1089 157 9.0 8 8 10 2016 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
129036 157 387 10.0 13 9 2 2015 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
129037 1112 157 1.0 1 14 9 2017 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
[129038 rows x 3007 columns]
Tips:
Replace:
## Handling the missing values
train_imp, test_imp = [pd.DataFrame(SimpleImputer(strategy='most_frequent').fit_transform(df)) for df in (train_df, test_df)]
## Assigning old column names
train_imp.columns = ['drugName', 'condition', 'review', 'rating', 'usefulCount', 'day', 'month', 'year']
test_imp.columns = ['drugName', 'condition', 'review', 'rating', 'usefulCount', 'day', 'month', 'year']
With:
## Handling the missing values and assigning old column names
train_imp, test_imp = [pd.DataFrame(SimpleImputer(strategy='most_frequent').fit_transform(df), columns=df.columns) for df in (train_df, test_df)]
Answered By - Corralien
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.