Issue
I have a preprocessing script that loads a diamonds dataset and preprocesses its features. I also need it to preprocess the labels (the target column).
Here is my code:
# Data Preprocessing
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic
# Read the CSV at `data_dir`, drop the unwanted columns, split the rows into
# train/test features and `price` labels, preprocess them, and return
# (x_train, x_test, y_train, y_test) as DataFrames.
# NOTE(review): as written, this function fails with the IndexError quoted in
# the traceback; the offending call is the second `fit` below.
def diamond_preprocess(data_dir):
data = pd.read_csv(data_dir)
cleaned_data = data.drop(['id', 'depth_percent'], axis=1) # Features I don't want
x = cleaned_data.drop(['price'], axis=1) # Feature columns (everything except the target)
y = cleaned_data['price'] # Label data (a single column, i.e. a 1-D Series)
# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99)
# The column lists come from x_train only, so they never contain 'price'
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()
numerical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')), # Fill in missing data with median
('scaler', StandardScaler()) # Scale data
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # Fill in missing data with 'missing'
('onehot', OneHotEncoder(handle_unknown='ignore')) # One hot encode categorical data
])
# Route each feature column to the transformer that matches its dtype
preprocessor_pipeline = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_features),
('cat', categorical_transformer, categorical_features)
])
# Fit to the training data
preprocessor_pipeline.fit(x_train)
# NOTE(review): y_train is the 1-D Series cleaned_data['price'], so it has no
# second dimension; this call is what raises
# `IndexError: tuple index out of range` at `X.shape[1]`. The transformer is
# also configured for the feature columns, not for the target.
preprocessor_pipeline.fit(y_train)
# Apply the pipeline to the training and test data
x_train_pipe = preprocessor_pipeline.transform(x_train)
x_test_pipe = preprocessor_pipeline.transform(x_test)
# Not reached because of the error above; same 1-D target problem
y_train_pipe = preprocessor_pipeline.transform(y_train)
y_test_pipe = preprocessor_pipeline.transform(y_test)
# Wrap the transformed outputs in new DataFrames
x_train = pd.DataFrame(data=x_train_pipe)
x_test = pd.DataFrame(data=x_test_pipe)
y_train = pd.DataFrame(data=y_train_pipe)
y_test = pd.DataFrame(data=y_test_pipe)
return x_train, x_test, y_train, y_test
I am not very confident that my code is correct or that I have a good understanding of how pipelines and preprocessing work in sklearn. Apparently, the interpreter agrees, as I get this error:
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
self.fit_transform(X, y=y)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
self._check_n_features(X, reset=True)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
n_features = X.shape[1]
IndexError: tuple index out of range
How do I properly preprocess my labels like I did with my training data? An explanation would be great as well!
Solution
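The error occurs because `preprocessor_pipeline` is a `ColumnTransformer` that was configured with the feature columns of `x_train` and expects 2-D input. `y_train` is a 1-D Series, so `X.shape[1]` raises `IndexError: tuple index out of range`. If you want to transform the target as well, create a separate pipeline for the target column and pass the target as a 2-D data frame; see the example below.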
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Worked example: preprocess the features and the target with two separate
# pipelines, each fitted on the training split only and then applied to both
# the training and the test split.

# generate the data
data = pd.DataFrame({
    'y': [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f'],
})

# extract the features and target
x = data.drop(labels=['y'], axis=1)
y = data[['y']]  # note that this is a data frame, not a series (transformers need 2-D input)

# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=99)

# map the features to the corresponding types (numerical or categorical)
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

# define the features pipeline
numerical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

features_pipeline = ColumnTransformer(
    transformers=[
        ('num_features', numerical_features_transformer, numerical_features),
        ('cat_features', categorical_features_transformer, categorical_features),
    ],
    # The one-hot encoder emits a sparse matrix; force a dense result so the
    # output can always be wrapped in a DataFrame below.
    sparse_threshold=0,
)

# define the target pipeline
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# fit the pipelines to the training data only, so that no statistics from the
# test split leak into the preprocessing
features_pipeline.fit(x_train)
target_pipeline.fit(y_train)

# apply the pipelines to the training and test data
x_train_pipe = features_pipeline.transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)

y_train_pipe = target_pipeline.transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)

# rebuild the data frames, keeping the row index of each split (and the target
# column name) so the rows can still be traced back to the original data
x_train = pd.DataFrame(data=x_train_pipe, index=x_train.index)
x_test = pd.DataFrame(data=x_test_pipe, index=x_test.index)

y_train = pd.DataFrame(data=y_train_pipe, index=y_train.index, columns=y_train.columns)
y_test = pd.DataFrame(data=y_test_pipe, index=y_test.index, columns=y_test.columns)
Answered By - Flavia Giammarino
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.