Issue
Trying to get a prediction using my decision tree model gives the titular error on the final line of code.
X=BTC_cleanData[-1:]
---> print(regressor.predict(X))
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),
(k,m?)->(n?,m?) (size 145 is different from 146)
As far as I can tell, I've successfully trained and tested the model, but I'm doing something wrong when I attempt to output a prediction. I think something about the way I am defining the target to be predicted is adding a column to a matrix somewhere, hence matmul error. How do I write a prediction function that works?
Here's the full code; I've left out the feature selection as it's very long:
import pandas as pd
import numpy as np
import talib
import matplotlib.pyplot as plt
%matplotlib inline
import investpy
from investpy import data
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
# Download Bitcoin historical data for the given date range using investpy.
# NOTE: `data` is bound to the same DataFrame as `BTC`; this rebinding also
# replaces the `data` name imported from investpy at the top of the script.
BTC = data = investpy.get_crypto_historical_data(crypto='bitcoin', from_date='01/01/2014', to_date='06/08/2020')
# Cast the Volume, High, Low and Close columns to float.
BTC.Volume = BTC.Volume.astype(float)
BTC.High = BTC.High.astype(float)
BTC.Low = BTC.Low.astype(float)
BTC.Close = BTC.Close.astype(float)
# Remove the 'Currency' column so it does not end up among the features.
del BTC['Currency']
# Add technical indicators as feature columns, computed by TA-Lib from the
# raw High/Low/Close/Volume arrays.
# NOTE(review): 'AD' is presumably the Accumulation/Distribution line — confirm
# against the TA-Lib documentation.
BTC['AD'] = talib.AD(BTC['High'].values, BTC['Low'].values, BTC['Close'].values, BTC['Volume'].values)
...(there is a long list here)
# Build the prediction target: each row's 'NextDayPrice' is the 'Close' of the
# following row (shift(-1)), so the last row has no target and holds NaN.
BTC['NextDayPrice'] = BTC['Close'].shift(-1)
# Work on a copy so BTC keeps every row, then drop all rows containing a NaN
# (this removes the last row, whose target is NaN).
BTC_cleanData = BTC.copy()
BTC_cleanData.dropna(inplace=True)
# Write the cleaned data to a CSV file at a hard-coded local path.
BTC_cleanData.to_csv('C:/Users/Admin/Desktop/BTCdata.csv')
# Split the data into training and testing sets.
# First separate the features from the target, then split both 70/30.
# The features are every column of BTC_cleanData except 'NextDayPrice',
# selected with a boolean column mask.
X_all = BTC_cleanData.iloc[:, BTC_cleanData.columns != 'NextDayPrice'] # feature values for all days
y_all = BTC_cleanData['NextDayPrice'] # corresponding targets/labels
print (X_all.head()) # print the first 5 rows
# Split features and target into training (70%) and testing (30%) sets, with a
# fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
from sklearn.linear_model import LinearRegression
# Create a linear regression model (not a decision tree, despite the question
# text) and fit it to the training set.
regressor = LinearRegression()
regressor.fit(X_train,y_train)
# Report how many rows ended up in each split.
print ("Training set: {} samples".format(X_train.shape[0]))
print ("Test set: {} samples".format(X_test.shape[0]))
# Evaluate the model out of sample: 10-fold cross-validation scores computed
# on the test split, plus the mean squared error of the test-set predictions.
# NOTE(review): no `scoring` argument is passed, so the value printed as
# "accuracy" is presumably the regressor's default score (R^2), not a
# classification accuracy — confirm against the scikit-learn docs.
# cross_validate is imported here but not used in the code shown.
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regressor, X_test, y_test, cv=10)
# Prints the mean score and half of the scores' standard deviation.
print ("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, regressor.predict(X_test))
print("MSE: %.4f" % mse)
# Evaluate the model in sample: the same two measurements on the training split.
trainscores = cross_val_score(regressor, X_train, y_train, cv=10)
print ("accuracy: %0.2f (+/- %0.2f)" % (trainscores.mean(), trainscores.std() / 2))
# `mse` is reused, so after this line it holds the training-set error.
mse = mean_squared_error(y_train, regressor.predict(X_train))
print("MSE: %.4f" % mse)
# Print the fitted model's predictions for every training row.
print(regressor.predict(X_train))
# Predict Next Day Price
# This is the call that raises the reported ValueError: BTC_cleanData still
# contains the 'NextDayPrice' target column, whereas the model was fit on
# X_train, which comes from X_all with that column excluded. X therefore has
# one more column than the model expects.
X=BTC_cleanData[-1:]
print(regressor.predict(X))
Solution
You trained your model on the X_train data. To predict on unseen data, you just need print(regressor.predict(X_test)).
Before you had:
X=BTC_cleanData[-1:] # this has one more column compared to X_train and X_test
print(regressor.predict(X))
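But BTC_cleanData[-1:] has one more column than X_train and X_test: it still contains the target column NextDayPrice. The model was trained on X_train, which DOES NOT have this additional column, and this mismatch leads to the error. (If you want a prediction for that last row only, drop the target column first, e.g. BTC_cleanData.drop(columns='NextDayPrice')[-1:].)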
Clean working code:
import pandas as pd
import numpy as np
import talib
import matplotlib.pyplot as plt
%matplotlib inline
import investpy
from investpy.crypto import get_crypto_historical_data
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Fetch Bitcoin historical data for the requested date range via investpy.
BTC = get_crypto_historical_data(
    crypto='bitcoin',
    from_date='01/01/2014',
    to_date='06/08/2020',
)
# Cast the Volume, High, Low and Close columns to float in one assignment.
float_columns = ['Volume', 'High', 'Low', 'Close']
BTC[float_columns] = BTC[float_columns].astype(float)
# Remove the 'Currency' column; it is not used as a feature.
BTC.drop(columns=['Currency'], inplace=True)
# Technical indicators used as features, computed by TA-Lib from raw arrays.
BTC['AD'] = talib.AD(
    BTC['High'].values,
    BTC['Low'].values,
    BTC['Close'].values,
    BTC['Volume'].values,
)
# Target column: the following row's 'Close' (the last row therefore gets NaN).
BTC['NextDayPrice'] = BTC['Close'].shift(-1)
# Keep only complete rows in a separate frame; BTC itself is left untouched.
BTC_cleanData = BTC.dropna()
# Optional: persist the cleaned data to disk.
#BTC_cleanData.to_csv('C:/Users/Admin/Desktop/BTCdata.csv')
# Separate features from the target, then split both into train/test sets.
# Features: every cleaned column except the 'NextDayPrice' target.
X_all = BTC_cleanData.drop(columns='NextDayPrice')
# Target: the value the model learns to predict for each row.
y_all = BTC_cleanData['NextDayPrice']
# Show the first 5 feature rows as a quick sanity check.
print(X_all.head())
# 70/30 train/test split with a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.30,
    random_state=42,
)
# Create a linear regression model and fit it to the training set.
regressor = LinearRegression().fit(X_train, y_train)
# Report how many rows ended up in each split.
print("Training set: {} samples".format(len(X_train)))
print("Test set: {} samples".format(len(X_test)))
# Out-of-sample evaluation: 10-fold cross-validation scores on the test split,
# followed by the mean squared error of the fitted model's test predictions.
scores = cross_val_score(regressor, X_test, y_test, cv=10)
score_summary = (scores.mean(), scores.std() / 2)
print("accuracy: %0.2f (+/- %0.2f)" % score_summary)
test_predictions = regressor.predict(X_test)
mse = mean_squared_error(y_test, test_predictions)
print("MSE: %.4f" % mse)
# In-sample evaluation: the same two measurements on the training split.
trainscores = cross_val_score(regressor, X_train, y_train, cv=10)
score_summary = (trainscores.mean(), trainscores.std() / 2)
print("accuracy: %0.2f (+/- %0.2f)" % score_summary)
train_predictions = regressor.predict(X_train)
# `mse` is reused, so from here on it holds the training-set error.
mse = mean_squared_error(y_train, train_predictions)
print("MSE: %.4f" % mse)
# Finally show the model's prediction for every training row.
print(train_predictions)
# Predict on the held-out test split (rows the model was not fit on).
print(regressor.predict(X_test))
# Predict Next Day Price: use the most recent row of BTC, whose own
# 'NextDayPrice' is still unknown (NaN). The target column is dropped first so
# the input has exactly the columns the model was trained on; dropna() then
# only removes rows whose *features* are incomplete. iloc[[-1]] keeps the
# result two-dimensional (a single-row DataFrame), as predict() requires.
latest_features = BTC.drop(columns='NextDayPrice').dropna().iloc[[-1]]
print(regressor.predict(latest_features))
Answered By - seralouk
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.