Issue
I have a CSV file that looks like this:
lemma,trained
iran seizes bitcoin mining machines power spike,-1
... (goes on for 1054 lines)
And my code looks like:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Load the CSV whose header (shown above) is `lemma,trained`.
df = pd.read_csv('lemma copy.csv')
# Column 0 holds the raw headline text (strings); column 1 holds the label.
X = df.iloc[:, 0].values
y = df.iloc[:, 1].values
print(y)
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size= 0.25, random_state=0)
sc_X = StandardScaler()
# This is the line that raises the ValueError in the traceback below:
# X_train still contains raw strings, which the scaler cannot convert to float.
X_train = sc_X.fit_transform(X_train)
I am getting the following error:
Traceback (most recent call last):
File "/home/arctesian/Scripts/School/EE/Algos/Qual/bayes/sklean.py", line 20, in <module>
X_train = sc_X.fit_transform(X_train)
File "/home/arctesian/.local/lib/python3.10/site-packages/sklearn/base.py", line 867, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "/home/arctesian/.local/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 809, in fit
return self.partial_fit(X, y, sample_weight)
File "/home/arctesian/.local/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 844, in partial_fit
X = self._validate_data(
File "/home/arctesian/.local/lib/python3.10/site-packages/sklearn/base.py", line 577, in _validate_data
X = check_array(X, input_name="X", **check_params)
File "/home/arctesian/.local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 856, in check_array
array = np.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'twitter ios beta lays groundwork bitcoin tips'
Printing the data shows that the random split happens to put that line first in the training set, so the problem must be in how the text data is being converted (encoded) into numbers. How do I fix this?
Solution
I fixed it by using @joshua megauth's method and getting rid of pandas. Here is what I did:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from coalas import csvReader as c
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
# df = pd.read_csv('lemma copy.csv')
def vect(X):
    """Turn an iterable of text documents into a dense token-count matrix.

    Fits the module-level ``vectorizer`` (the ``CountVectorizer`` created in
    the ``__main__`` section) on ``X`` and transforms ``X`` with it.

    Args:
        X: Iterable of text documents (one string per sample).

    Returns:
        A dense array of token counts with one row per document, suitable
        for the ``StandardScaler`` / ``GaussianNB`` steps that follow.
    """
    # fit_transform returns a sparse matrix; densify it for the later steps.
    features = vectorizer.fit_transform(X)
    return features.toarray()
def test():
    """Print the classifier's accuracy on the held-out test split.

    Reads the module-level ``classifer``, ``X_test`` and ``y_test`` that are
    set up in the ``__main__`` section; takes no arguments and returns None.
    """
    y_pred = classifer.predict(X_test)
    # accuracy_score's documented argument order is (y_true, y_pred).
    print(accuracy_score(y_test, y_pred))
if __name__ == "__main__":
    # Load the CSV. After this call the columns are read as attributes of
    # `c` (c.lemma, c.trained) -- presumably named after the CSV header.
    c.importCSV('lemma copy.csv')

    # Bag-of-words vectorizer used by vect(); case is left as-is.
    vectorizer = CountVectorizer(
        analyzer='word',
        lowercase=False,
    )

    X = c.lemma
    # y = c.Best
    y = c.trained

    # Convert the raw text into a numeric count matrix so that it can be
    # scaled and fed to the classifier.
    features_nd = vect(X)

    # Hold out 20% of the samples; random_state=0 keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        features_nd, y, test_size=0.2, random_state=0
    )

    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    # Only transform the test split: refitting the scaler here would scale
    # the test data with different statistics than the training data.
    X_test = sc_X.transform(X_test)

    classifer = GaussianNB()
    classifer.fit(X_train, y_train)
    test()
Answered By - Daniel Okita
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.