Issue
I've been stuck for two days trying to get this machine learning code to work: it should read a CSV file of YouTube comments and scan it for hate speech. I got the code from: https://thecleverprogrammer.com/2021/07/25/hate-speech-detection-with-machine-learning/
I added code that reads a new CSV file of YouTube comments, which is cleaned by the clean() function. The first sample, testing a single sentence, works without any issues. After that I want to read the CSV file and scan it with the model, which has already learned hate speech from the Twitter CSV file, if I understand it correctly.
Unfortunately I can't scan the CSV file and get the output I want. I've tried many things, but every time I get a different error or issue. I think I may be doing something simple wrong. The error occurs at the last 3 lines of this code. (I've only been programming for 4 weeks, so I'm quite new to all of this.)
CSV Files: https://drive.google.com/drive/folders/1nAIE5gAlMOx89vhqmVpaLGYESCVJkDZk?usp=sharing
from nltk.util import pr
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words('english'))
data = pd.read_csv("twitter.csv") # Read the twitter CSV file which the model learns from.
test2 = pd.read_csv("youtube-comments.csv") # Let the model scan this file to see if the model is smart enough to scan this.
test2.drop(["Time", "Likes", "Reply Count", "Name"], axis=1, inplace=True) # Drop columns we don't need in YouTube Comments CSV File
# nltk.download() # Use this if you're missing NLTK packages
data["labels"] = data["class"].map({0: "Hate Speech", # Create labels
1: "Offensive Language",
2: "No Hate and Offensive"})
data = data[["tweet", "labels"]]
# test2 = test2[["Comment", [data["labels"]]]]
print("\n")
print(data.head())
def clean(text):
    text = str(text).lower()
    text = re.sub(r'[^\w]', ' ', text)  # replace non-word characters with spaces
    text = re.sub(r'\[.*?\]', '', text)  # remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'<.*?>+', '', text)  # remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub(r'\n', '', text)  # remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing digits
    text = [word for word in text.split(' ') if word not in stopword]  # drop stopwords
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]  # stem every word
    text = " ".join(text)
    return text
data["tweet"] = data["tweet"].apply(clean)
test2["Comment"] = test2["Comment"].apply(clean) # Cleaning CSV File Data of YouTube Comments.
print("\n")
print(test2) # Print out the CSV test data to see if it's cleaned
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
sample = "Let's unite and kill all the people who shot are fucking protesting against the government"
data = cv.transform([sample]).toarray()
print(clf.predict(data)) # This example works with a sentence
print("\n") # Now I want to try to read the CSV File and scan that for Hate speech. Here comes the error.
sample = test2
data = cv.transform([sample]).toarray()
print(clf.predict(data))
Output and error (some print statements at the start removed):
[40933 rows x 1 columns]
['No Hate and Offensive']
Traceback (most recent call last):
  File "C:\Users\Mango\YouTube Test Hate Speech.py", line 64, in <module>
    data = cv.transform([sample]).toarray()
  File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1254, in transform
    _, X = self._count_vocab(raw_documents, fixed_vocab=True)
  File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1114, in _count_vocab
    for feature in analyze(doc):
  File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 104, in _analyze
    doc = preprocessor(doc)
  File "C:\Users\Mango\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 69, in _preprocess
    doc = doc.lower()
  File "C:\Users\Mango\anaconda3\lib\site-packages\pandas\core\generic.py", line 5487, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'lower'
Solution
The problem is in:
sample = test2
data = cv.transform([sample]).toarray()
test2 is already a DataFrame, so it doesn't need the extra [ ]:
sample = test2
data = cv.transform(sample).toarray()
Frankly, you should pass only one column of the DataFrame, test2['Comment']:
sample = test2['Comment']
data = cv.transform(sample).toarray()
That's all.
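For intuition: CountVectorizer.transform() expects an iterable of string documents. [sample] is a list whose single element is the whole DataFrame, so the vectorizer's preprocessor calls .lower() on that DataFrame and raises the AttributeError above. A minimal sketch, using a made-up two-row DataFrame as a stand-in for test2, shows which inputs work:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical stand-in for test2 with only two comments.
df = pd.DataFrame({"Comment": ["hello world", "nice video"]})

cv = CountVectorizer()
cv.fit(df["Comment"])

print(cv.transform(df["Comment"]).shape)   # OK: a Series is an iterable of strings
print(cv.transform(["hello world"]).shape) # OK: a list containing one string document
# cv.transform([df])  # AttributeError: 'DataFrame' object has no attribute 'lower',
#                     # because the single "document" in the list is the whole DataFrame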
EDIT:
Full working code
import re
import string
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.util import pr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# --- functions ---
def clean(text):
    text = text.lower()
    text = re.sub(r'[^\w]', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    #text = [word for word in text.split(' ') if word not in stopword]
    #text = " ".join(text)
    #text = [stemmer.stem(word) for word in text.split(' ')]
    # shorter
    text = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    return text
# --- main ---
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))
data = pd.read_csv("twitter.csv") # Read the twitter CSV file which the model learns from.
test = pd.read_csv("youtube-comments.csv") # Let the model scan this file to see if the model is smart enough to scan this.
# - clean -
test.drop(["Time", "Likes", "Reply Count", "Name"], axis=1, inplace=True) # Drop columns we don't need in YouTube Comments CSV File
# nltk.download() # Use this if you're missing NLTK packages
data["labels"] = data["class"].map({0: "Hate Speech", # Create labels
1: "Offensive Language",
2: "No Hate and Offensive"})
data = data[["tweet", "labels"]]
# test = test[["Comment", [data["labels"]]]]
data["tweet"] = data["tweet"].apply(clean)
test["Comment"] = test["Comment"].apply(clean) # Cleaning CSV File Data of YouTube Comments.
print("\n--- data ---\n")
print(data.head())
print("\n--- test ---")
print(test.head())
# - train -
x = np.array(data["tweet"])
y = np.array(data["labels"])
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# - prediction 1 -
print('\n--- prediction 1 ---\n')
sample = ["Let's unite and kill all the people who shot are fucking protesting against the government"]
data = cv.transform(sample)  # .toarray() is not needed; predict() accepts a sparse matrix
print('- sample -\n')
print(sample)
print('\n- prediction -\n')
print(clf.predict(data))
# - prediction 2 -
print('\n--- prediction 2 ---\n')
sample = test['Comment']
data = cv.transform(sample)  # .toarray() is not needed; predict() accepts a sparse matrix
print('- sample -\n')
print(sample)
print('\n- prediction -\n')
print(clf.predict(data))
Result:
# ...
--- prediction 1 ---
- sample -
["Let's unite and kill all the people who shot are fucking protesting against the government"]
- prediction -
['No Hate and Offensive']
--- prediction 2 ---
- sample -
0 join us countdown reveal next battlefield sta...
1 darrel swatten thx ill recruit new helper vol...
2 prez mrm great innov onlin industri call look...
3 darrel swatten copi clone bfv game base bfv
4 darrel swatten ill make game base bfv new we...
...
40928 origin twist steel trailer teas new plane cras...
40929 banzaaaaaaiiiii
40930 kamikaz instead v rocket
40931 emperor
40932 baanzzaaii
Name: Comment, Length: 40933, dtype: object
- prediction -
['No Hate and Offensive' 'No Hate and Offensive' 'No Hate and Offensive'
... 'No Hate and Offensive' 'No Hate and Offensive'
'No Hate and Offensive']
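As a possible follow-up (not part of the original answer): if you want to keep each prediction next to the comment it belongs to, you could assign the predicted labels back into the DataFrame and filter for the flagged rows, for example:
# Sketch: store the predicted label with each comment, then keep only
# the rows the model did not classify as "No Hate and Offensive".
test["label"] = clf.predict(cv.transform(test["Comment"]))
flagged = test[test["label"] != "No Hate and Offensive"]
print(flagged.head())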
Answered By - furas