Issue
I have defined the following function, which returns the AUC and PRC scores for the training and test datasets. You can find the datasets through the links below:
train dataset: https://drive.google.com/file/d/1466SDm1nOpeDb_3UnW8Qjc1VEY_Be0R5/view?usp=sharing
test dataset: https://drive.google.com/file/d/1vphjb3xbrklhLHNMYUexN6X_axepm0Xy/view?usp=sharing
Both datasets have samples in the following format: the text column contains the documents, and the label column gives the sentiment of each document.
label  text
1      I must admit that I'm addicted to "Version 2.0...
0      I think it's such a shame that an enormous tal...
1      The Sunsout No Room at The Inn Puzzle has oddl...
...    ...
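For reference, this is how the data looks once loaded (a minimal sketch; the file names are the ones used in the code below):

import pandas as pd

train = pd.read_csv("train5-1.csv")
print(train.shape)                       # number of samples and columns
print(train['label'].value_counts())     # distribution of the 0/1 sentiment labels
print(train[['label', 'text']].head(3))  # first few documents with their labels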
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
train = pd.read_csv("train5-1.csv")
test = pd.read_csv("test5.csv")
def create_model(train_docs, train_y, test_docs, test_y,
                 model_type='svm', stop_words=None, min_df=1,
                 print_result=True, algorithm_para=1.0):
    tfidf_vect = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    tfidf_vect.fit_transform(train["text"])
    y_test = test['label'].values
    y_train = train["label"].values
    X_train = tfidf_vect.fit_transform(train['text'].values)
    X_test = tfidf_vect.transform(test['text'].values)
    if 'svm' in model_type:
        clf = svm.SVC(kernel='linear', probability=True)
        clf = svm.LinearSVC(C=algorithm_para).fit(X_train, y_train)
        predicted = clf.predict(X_test)
        labels = sorted(train['label'].unique())
        precision, recall, fscore, support = precision_recall_fscore_support(
            y_test, predicted, labels=labels)
        if print_result:
            print("labels: ", labels)
            print("precision: ", precision)
            print("recall: ", recall)
            print("f-score: ", fscore)
            print("support: ", support)
        predict_p = clf._predict_proba_lr(X_test)
        y_pred = predict_p[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
        precision, recall, thresholds = precision_recall_curve(y_test, y_pred, pos_label=1)
        auc_score = auc(fpr, tpr)
        prc_score = auc(recall, precision)
        if print_result:
            print("AUC: {:.2%}".format(auc_score), "PRC: {:.2%}".format(prc_score))
            plt.figure()
            plt.plot(fpr, tpr, color='darkorange', lw=2)
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('AUC of SVM Model')
            plt.show()
            plt.figure()
            plt.plot(recall, precision, color='darkorange', lw=2)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision_Recall_Curve of SVM Model')
            plt.show()
    else:
        clf = MultinomialNB(alpha=algorithm_para).fit(X_train, y_train)
        predicted = clf.predict(X_test)
        labels = sorted(train['label'].unique())
        precision, recall, fscore, support = precision_recall_fscore_support(
            y_test, predicted, labels=labels)
        if print_result:
            print("labels: ", labels)
            print("precision: ", precision)
            print("recall: ", recall)
            print("f-score: ", fscore)
            print("support: ", support)
        predict_p = clf.predict_proba(X_test)
        y_pred = predict_p[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
        precision, recall, thresholds = precision_recall_curve(y_test, y_pred, pos_label=1)
        auc_score = auc(fpr, tpr)
        prc_score = auc(recall, precision)
        if print_result:
            print("AUC: {:.2%}".format(auc_score), "PRC: {:.2%}".format(prc_score))
            plt.figure()
            plt.plot(fpr, tpr, color='darkorange', lw=2)
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('AUC of SVM Model')
            plt.show()
            plt.figure()
            plt.plot(recall, precision, color='darkorange', lw=2)
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision_Recall_Curve of SVM Model')
            plt.show()
    return auc_score, prc_score
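For completeness, an example call (mirroring the parameters used in sample_size_impact below; the function returns the (auc_score, prc_score) tuple):

auc_score, prc_score = create_model(train, train['label'], test, test['label'],
                                    model_type='svm', stop_words='english',
                                    min_df=1, print_result=True, algorithm_para=1.0)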
Then, to investigate the impact of sample size on the performance of the above classifier, I defined another function as follows:
def sample_size_impact(train_docs, train_y, test_docs, test_y):
    auc_list_svm = []
    t_size = np.linspace(500, 12000, 24)
    for i in range(int(len(train_docs) / 500)):
        auc_score_svm = create_model(train_docs[:(i + 1) * 500], train_y[:(i + 1) * 500],
                                     test_docs, test_y, model_type='svm',
                                     stop_words='english', min_df=1,
                                     print_result=False, algorithm_para=1.0)
        auc_list_svm.append(auc_score_svm)
    plt.figure()
    plt.plot(auc_list_svm, color='darkorange')
    plt.xlabel('Sample Size')
    plt.ylabel('AUC')
    plt.title('sample size impact comparison')
    plt.show()
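I invoke it like this, with the loaded frames and their label columns:

sample_size_impact(train, train['label'], test, test['label'])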
But the sample_size_impact function is not working correctly. Would you please investigate my code and tell me where I made a mistake?
Solution
You have an error in create_model: you're using the global (full) training data train every time, instead of the argument train_docs (and likewise the global test instead of test_docs). It should be:
tfidf_vect.fit_transform(train_docs["text"])
y_test = test_docs['label'].values
y_train = train_docs["label"].values
X_train = tfidf_vect.fit_transform(train_docs['text'].values)
X_test = tfidf_vect.transform(test_docs['text'].values)
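Put together, the start of create_model becomes something like this (a sketch of just the lines that change; the rest of the function stays as you wrote it):

def create_model(train_docs, train_y, test_docs, test_y,
                 model_type='svm', stop_words=None, min_df=1,
                 print_result=True, algorithm_para=1.0):
    tfidf_vect = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    # fit on the subset actually passed in, not the global frame
    # (this first fit is redundant anyway, since fit_transform is called again below)
    tfidf_vect.fit_transform(train_docs["text"])
    y_test = test_docs['label'].values
    y_train = train_docs['label'].values
    X_train = tfidf_vect.fit_transform(train_docs['text'].values)
    X_test = tfidf_vect.transform(test_docs['text'].values)
    ...

With this change, each iteration of sample_size_impact actually trains on the first (i + 1) * 500 documents, instead of silently refitting on the full training set every time.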
Answered By - Erwan