I've been stuck for 2 days trying to get this machine learning code to work. It should read a CSV file of YouTube comments and scan it for hate speech. I got this code from:
I added code that reads a new CSV file of YouTube comments, which are cleaned by the `clean` function. The first test, on a single sample sentence, works without any issues. After that I want to read the CSV file and scan it with the machine learning model, which, if I'm correct, has already learned hate speech from the Twitter CSV file.
Unfortunately I am not able to scan the CSV file and get the output I want. I have tried many things, but every time I get a different error or issue. I think I may be doing something simple wrong. The error occurs at the last 3 lines of this code. (I've only been programming for 4 weeks, so I'm kind of new to all of this.)
CSV Files:
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
from nltk.corpus import stopwords
import string

# NLTK helpers used by clean(): an English Snowball stemmer and the English
# stop-word list, stored as a set.
# If the NLTK stop-word corpus is missing, download it once with
# nltk.download('stopwords').
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

# Training data: the Twitter CSV file which the model learns from.
data = pd.read_csv("twitter.csv")
# Data to classify: the YouTube comments CSV file the model should scan.
test2 = pd.read_csv("youtube-comments.csv")
# Drop the columns that are not needed from the YouTube comments.
test2.drop(columns=["Time", "Likes", "Reply Count", "Name"], inplace=True)

# Turn the numeric "class" column into readable labels and keep only the
# tweet text together with its label.
data["labels"] = data["class"].map(
    {
        0: "Hate Speech",
        1: "Offensive Language",
        2: "No Hate and Offensive",
    }
)
data = data[["tweet", "labels"]]
def clean(text):
    """Normalise one piece of text for the bag-of-words model.

    The text is lower-cased; bracketed fragments, URLs and HTML-like tags
    are removed; every remaining non-word character becomes a space;
    underscores and tokens containing a digit are dropped; stop words are
    filtered out and the surviving words are stemmed.

    Uses the module-level ``stopword`` set and ``stemmer`` object.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string cells (for example NaN from an empty CSV field)
            do not raise.

    Returns:
        The stemmed words joined by single spaces; an empty string when
        nothing is left.
    """
    text = str(text).lower()
    # These three patterns need the original punctuation ("[", "://", "<"),
    # so they must run before punctuation is replaced by spaces; otherwise
    # they can never match and URLs end up in the text as ordinary words.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Everything that is not a letter, digit or underscore becomes a space
    # (this also covers punctuation, newlines and emoji).
    text = re.sub(r'\W', ' ', text)
    # "_" counts as a word character, so it survives the step above.
    text = text.replace('_', '')
    # Drop every token that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument skips the empty strings that runs of
    # spaces would otherwise produce.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(words)
data["tweet"] = data["tweet"].apply(clean)
test2["Comment"] = test2["Comment"].apply(clean)  # Cleaning CSV File Data of YouTube Comments.
print(test2)  # Printing out CSV test data to see if its cleaned

# Bag-of-words features (x) and labels (y) from the training tweets.
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)  # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Create the classifier and train it on the training split. These are two
# separate statements; fused into one line they are a syntax error.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Classify one hand-written sentence. transform() expects an iterable of
# strings, which is why the single string is wrapped in a list.
sample = "Let's unite and kill all the people who shot are fucking protesting against the government"
data = cv.transform([sample]).toarray()
print(clf.predict(data))  # This example works with a sentence
print("\n") # Now I want to try to read the CSV File and scan that for Hate speech. Here comes the error.
# NOTE: `sample` is the whole DataFrame here, and `[sample]` wraps it in a
# list, so the vectorizer receives the DataFrame itself as a single
# "document". This is the call that fails in the traceback below with
# "'DataFrame' object has no attribute 'lower'".
sample = test2
data = cv.transform([sample]).toarray()
Error output (I removed some print statements at the start):
[40933 rows x 1 columns] ['No Hate and Offensive']
Traceback (most recent call last): File "C:\Users\Mango\YouTube Test
Hate", line 64, in <module>
data = cv.transform([sample]).toarray() File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\",
line 1254, in transform
_, X = self._count_vocab(raw_documents, fixed_vocab=True) File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\",
line 1114, in _count_vocab
for feature in analyze(doc): File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\",
line 104, in _analyze
doc = preprocessor(doc) File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\",
line 69, in _preprocess
doc = doc.lower() File "C:\Users\Mango\anaconda3\lib\site-packages\pandas\core\",
line 5487, in __getattr__
return object.__getattribute__(self, name) AttributeError: 'DataFrame' object has no attribute 'lower'
The problem is in these lines:
sample = test2
data = cv.transform([sample]).toarray()
`test2` (and therefore `sample`) is a DataFrame, and it doesn't need the `[]` around it:
sample = test2
data = cv.transform( sample ).toarray()
Frankly, you should pass only one column of the DataFrame, `test2['Comment']`:
sample = test2['Comment']
data = cv.transform( sample ).toarray()
That's all.
Full working code
import re
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.util import pr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --- functions ---
def clean(text):
    """Normalise one piece of text for the bag-of-words model.

    The text is lower-cased; bracketed fragments, URLs and HTML-like tags
    are removed; every remaining non-word character becomes a space;
    underscores and tokens containing a digit are dropped; stop words are
    filtered out and the surviving words are stemmed.

    Uses the module-level ``stopword`` set and ``stemmer`` object, which
    only have to exist by the time the function is called.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string cells (for example NaN from an empty CSV field)
            do not raise ``AttributeError`` on ``.lower()``.

    Returns:
        The stemmed words joined by single spaces; an empty string when
        nothing is left.
    """
    text = str(text).lower()
    # These three patterns need the original punctuation ("[", "://", "<"),
    # so they must run before punctuation is replaced by spaces; otherwise
    # they can never match and URLs end up in the text as ordinary words.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Everything that is not a letter, digit or underscore becomes a space
    # (this also covers punctuation, newlines and emoji).
    text = re.sub(r'\W', ' ', text)
    # "_" counts as a word character, so it survives the step above.
    text = text.replace('_', '')
    # Drop every token that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Filter stop words and stem in one pass; split() without an argument
    # skips the empty strings that runs of spaces would otherwise produce.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(words)
# --- main ---
# NLTK helpers used by clean(): an English Snowball stemmer and the English
# stop-word list, stored as a set.
# If the NLTK stop-word corpus is missing, download it once with
# nltk.download('stopwords').
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

data = pd.read_csv("twitter.csv")  # Read the twitter CSV file which the model learns from.
test = pd.read_csv("youtube-comments.csv")  # Let the model scan this file to see if the model is smart enough to scan this.

# - clean -

# Drop columns we don't need in YouTube Comments CSV File
test = test.drop(columns=["Time", "Likes", "Reply Count", "Name"])

# Turn the numeric "class" column into readable labels and keep only the
# tweet text together with its label.
data["labels"] = data["class"].map(
    {
        0: "Hate Speech",
        1: "Offensive Language",
        2: "No Hate and Offensive",
    }
)
data = data[["tweet", "labels"]]

data["tweet"] = data["tweet"].apply(clean)
test["Comment"] = test["Comment"].apply(clean)  # Cleaning CSV File Data of YouTube Comments.

# Show both cleaned tables; without these print calls the headers would be
# followed by nothing.
print("\n--- data ---\n")
print(data)
print("\n--- test ---")
print(test)
# - train -

# Bag-of-words features (x) and labels (y) from the training tweets.
x = np.array(data["tweet"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create the classifier and train it on the training split. These are two
# separate statements; fused into one line they are a syntax error.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# - prediction 1 -

print('\n--- prediction 1 ---\n')

# transform() expects an iterable of strings, so a single sentence has to be
# wrapped in a list.
sample = ["Let's unite and kill all the people who shot are fucking protesting against the government"]
# The sparse matrix is passed to predict() directly; .toarray() is not needed.
data = cv.transform(sample)

print('- sample -\n')
print(sample)
print('\n- prediction -\n')
print(clf.predict(data))

# - prediction 2 -

print('\n--- prediction 2 ---\n')

# A single DataFrame column is already an iterable of strings, so it is
# passed as-is (no surrounding []), one comment per document.
sample = test['Comment']
data = cv.transform(sample)

print('- sample -\n')
print(sample)
print('\n- prediction -\n')
print(clf.predict(data))
# ...
--- prediction 1 ---
- sample -
["Let's unite and kill all the people who shot are fucking protesting against the government"]
- prediction -
['No Hate and Offensive']
--- prediction 2 ---
- sample -
0 join us countdown reveal next battlefield sta...
1 darrel swatten thx ill recruit new helper vol...
2 prez mrm great innov onlin industri call look...
3 darrel swatten copi clone bfv game base bfv
4 darrel swatten ill make game base bfv new we...
40928 origin twist steel trailer teas new plane cras...
40929 banzaaaaaaiiiii
40930 kamikaz instead v rocket
40931 emperor
40932 baanzzaaii
Name: Comment, Length: 40933, dtype: object
- prediction -
['No Hate and Offensive' 'No Hate and Offensive' 'No Hate and Offensive'
... 'No Hate and Offensive' 'No Hate and Offensive'
'No Hate and Offensive']
Answered By - furas
Post a Comment
Note: Only a member of this blog may post a comment.