#load csv file
from sklearn import metrics
from numpy import savetxt
import pandas as pd
from sklearn.svm import SVC
# Folder that holds the train/test CSV files for this project.
data_path = r'C:\Users\mbvsuraj\Documents\Python_Scripts\rrevif\wilsonchua\Project2\wilsonchuah-attachments'
# Load both splits with one expression; each suffix is appended to data_path.
df_train, df_test = [
    pd.read_csv(data_path + csv_suffix)
    for csv_suffix in ("\\train.csv", "\\test.csv")
]
#fpb = pd.read_csv("~/Downloads/FPB.csv", encoding = "ISO-8859-1", header = None)
#fpb.columns = ["sentiment", "review"]
# Preview the first rows of the two columns used below (label and review text).
df_train[['label','review']].head()
label | review | |
---|---|---|
0 | american (traditional) | So, we stopped here on our way to the Side Que... |
1 | american (new) | This is our go-to healthy spot! The food is al... |
2 | mexican | Food court meal at Gerrard Square. It's been ... |
3 | mexican | Located on Rainbow/Charleston, this small fami... |
4 | chinese | No frills Chinese takeout joint which serves u... |
import nltk
import re
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the 'punkt' tokenizer model and the
# 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

# Keep the English stop words in a set so membership tests are cheap.
stop = set(stopwords.words('english'))
# Shared Porter stemmer instance used by the preprocessing function.
ps = PorterStemmer()
# create text preprocessing function, not removing Re
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    """Tokenize ``doc`` into lower-cased tokens with stop words removed.

    Args:
        doc: Text to process. It is split into sentences with
            ``sent_tokenize`` and each sentence into words with
            ``word_tokenize``.
        stemming: When true, every kept token is stemmed with the
            module-level Porter stemmer ``ps``.
        need_sent: When true, return one token list per sentence instead of
            a single flat list.

    Returns:
        A flat list of tokens, or a list of per-sentence token lists when
        ``need_sent`` is true.
    """
    def _clean(words):
        # Filter on the lower-cased token BEFORE stemming. Previously the
        # stop-word test ran on the stemmed, not-yet-lower-cased token, so a
        # stemmed stop word such as "ha" (from "has") slipped through.
        kept = [w.lower() for w in words if w.lower() not in stop]
        if stemming:
            kept = [ps.stem(w) for w in kept]
        return kept

    # step 1 + 2: sentences, then cleaned tokens for each sentence
    per_sentence = [_clean(word_tokenize(sent)) for sent in sent_tokenize(doc)]
    if need_sent:
        # Keep sentence boundaries. The old code applied str methods to the
        # nested lists here and raised instead of returning them.
        return per_sentence
    return [tok for sent_tokens in per_sentence for tok in sent_tokens]
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\mbvsuraj\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\mbvsuraj\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
# choose the first 5 row and apply preprocessing function
#fpb.review[:5].apply(pre_processing_by_nltk)
0 [accord, gran, ,, compani, ha, plan, move, pro... 1 [technopoli, plan, develop, stage, area, less,... 2 [intern, electron, industri, compani, elcoteq,... 3 [new, product, plant, compani, would, increas,... 4 [accord, compani, 's, updat, strategi, year, 2... Name: review, dtype: object
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
#np.set_printoptions(precision=2)
# TF-IDF vectorizer that tokenizes each review with pre_processing_by_nltk.
tfidf = TfidfVectorizer(lowercase=True, tokenizer=pre_processing_by_nltk)
# Training labels as a plain array.
y_tfidf = df_train['label'].values
#X_tfidf = tfidf.fit_transform(df_train['review'])
#Xt_train, Xt_remain, yt_train, yt_remain = train_test_split(X_tfidf, y_tfidf,random_state=42, test_size=0.3, shuffle=True)
#Xt_test, Xt_valid, yt_test, yt_valid = train_test_split(Xt_remain, yt_remain,random_state=42, test_size=0.5, shuffle=True)
#clf_tfidf = LogisticRegression(C=10,random_state=42,n_jobs=-1,max_iter=100).fit(Xt_train, yt_train)
# calculate the tfidf validation score
#print(clf_tfidf.score(Xt_valid, yt_valid))
# The vectorizer is fitted on the train and test reviews together.
# Series.append was removed in pandas 2.0; pd.concat stacks the two Series
# in the same order (train rows first, then test rows).
total_data = pd.concat([df_train['review'], df_test['review']])
ss = tfidf.fit_transform(total_data)
# Feature matrix for the training reviews only, using the fitted vectorizer.
X_tfidf = tfidf.transform(df_train['review'])
# train_X, test_X, train_y, test_y = train_test_split(X_tfidf,
# y_tfidf,
# test_size=0.3,
# random_state=1
# )
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-19-329e5ad373b7> in <module> 2 y_tfidf, 3 test_size=0.3, ----> 4 random_state=1 5 ) ~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_split.py in train_test_split(*arrays, **options) 2125 raise TypeError("Invalid parameters passed: %s" % str(options)) 2126 -> 2127 arrays = indexable(*arrays) 2128 2129 n_samples = _num_samples(arrays[0]) ~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in indexable(*iterables) 290 """ 291 result = [_make_indexable(X) for X in iterables] --> 292 check_consistent_length(*result) 293 return result 294 ~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays) 254 if len(uniques) > 1: 255 raise ValueError("Found input variables with inconsistent numbers of" --> 256 " samples: %r" % [int(l) for l in lengths]) 257 258 ValueError: Found input variables with inconsistent numbers of samples: [13144, 13143]
Section Parameter Tuning 1
# ## Comment the text before submitting
# gammas = [10, 100,1000]
# f1_table = pd.DataFrame(columns = ['degree','gamma','f1_score'])
# degrees = [ 2, 3, 4, 5, 6]
# f1_table['Solver'] = pd.Series(range(1,41))
# j=0
# for degree in degrees:
# for i in gammas:
# # Apply SVM classification model to training data
# lr = SVC(kernel='poly', gamma=i,degree=degree)
# lr.fit(train_X,train_y)
# # Predict using model
# y_pred = lr.predict(test_X)
# # Saving F1 score in table
# f1_table.iloc[j,2] = metrics.f1_score(test_y,y_pred,average='micro')
# f1_table.iloc[j,1] = i
# f1_table.iloc[j,0] =degree
# j += 1
# print(f1_table)
degree gamma f1_score Solver 0 2 10 0.794371 1 1 2 100 0.794371 2 2 2 1000 0.794371 3 3 3 10 0.762424 4 4 3 100 0.762424 5 5 3 1000 0.762424 6 6 4 10 0.704361 7 7 4 100 0.704361 8 8 4 1000 0.704361 9 9 5 10 0.614604 10 10 5 100 0.614604 11 11 5 1000 0.614604 12 12 6 10 0.508367 13 13 6 100 0.508367 14 14 6 1000 0.508367 15 15 NaN NaN NaN 16 16 NaN NaN NaN 17 17 NaN NaN NaN 18 18 NaN NaN NaN 19 19 NaN NaN NaN 20 20 NaN NaN NaN 21 21 NaN NaN NaN 22 22 NaN NaN NaN 23 23 NaN NaN NaN 24 24 NaN NaN NaN 25 25 NaN NaN NaN 26 26 NaN NaN NaN 27 27 NaN NaN NaN 28 28 NaN NaN NaN 29 29 NaN NaN NaN 30 30 NaN NaN NaN 31 31 NaN NaN NaN 32 32 NaN NaN NaN 33 33 NaN NaN NaN 34 34 NaN NaN NaN 35 35 NaN NaN NaN 36 36 NaN NaN NaN 37 37 NaN NaN NaN 38 38 NaN NaN NaN 39 39 NaN NaN NaN 40
Section Parameter tuning 2
# ##Comment the text before submitting
# C_param_range = [0.1,1,10,100,1000]
# f1_table = pd.DataFrame(columns = ['Solver','C_parameter','f1_score'])
# solver_eg = ['saga','newton-cg','lbfgs','sag']
# f1_table['Solver'] = pd.Series(range(1,31))
# j=0
# for text in solver_eg:
# for i in C_param_range:
# # Apply logistic regression model to training data
# lr = LogisticRegression( C = i,random_state = 42,multi_class='multinomial', solver=text,max_iter=100000000)
# lr.fit(train_X,train_y)
# # Predict using model
# y_pred = lr.predict(test_X)
# # Saving F1 score in table
# f1_table.iloc[j,2] = metrics.f1_score(test_y,y_pred,average='weighted')
# f1_table.iloc[j,1] = i
# f1_table.iloc[j,0] =text
# j += 1
# print(f1_table)
Solver C_parameter f1_score 0 saga 0.1 0.657831 1 saga 1 0.766212 2 saga 10 0.79263 3 saga 100 0.795402 4 saga 1000 0.792978 5 newton-cg 0.1 0.658153 6 newton-cg 1 0.766445 7 newton-cg 10 0.79263 8 newton-cg 100 0.794655 9 newton-cg 1000 0.793663 10 lbfgs 0.1 0.658153 11 lbfgs 1 0.766445 12 lbfgs 10 0.792643 13 lbfgs 100 0.794655 14 lbfgs 1000 0.793448 15 sag 0.1 0.658153 16 sag 1 0.766445 17 sag 10 0.792643 18 sag 100 0.794865 19 sag 1000 0.795453 20 21 NaN NaN 21 22 NaN NaN 22 23 NaN NaN 23 24 NaN NaN 24 25 NaN NaN 25 26 NaN NaN 26 27 NaN NaN 27 28 NaN NaN 28 29 NaN NaN 29 30 NaN NaN
# ## Comment the text before submitting
# #Final Submission Logistic Regression
# clf = LogisticRegression(C = 1000,random_state = 42,multi_class='multinomial', solver='sag',max_iter=100000000)
# clf.fit(train_X,train_y)
# y_pred = clf.predict(test_X)
# metrics.f1_score(test_y,y_pred,average='micro')
# #F1 score 79.54
# ## Comment the text before submitting
# #Final Submission SVM
# clf = SVC(kernel='poly', gamma=10,degree=2)
# clf.fit(train_X,train_y)
# y_pred = clf.predict(test_X)
# metrics.f1_score(test_y,y_pred,average='micro')
# #f1 score 79.56
Change the values in the 1st model (logistic regression) below according to the results in "Section Parameter tuning 2".
# ## 1st model Logistic regression Final Submission
# clf = LogisticRegression(C = 1000,random_state = 42,multi_class='multinomial', solver='sag',max_iter=100000000) #Need to change values here depending upon the above table
# clf.fit(X_tfidf,y_tfidf)
# #transforming test to tfidf
# y_tfidf_test = tfidf.transform(df_test['review'])
# preds = clf.predict(y_tfidf_test)
# ## 1st model Logistic regression Final Submission
# # in your implementation, create the output file using the same format
# dic = {"Id": [], "Predicted": []}
# for i, pred in enumerate(preds):
# dic["Id"].append(i)
# dic["Predicted"].append(pred)
# dic_df = pd.DataFrame.from_dict(dic)
# dic_df.to_csv(data_path + "predicted.csv", index=False)
Change the values in the 2nd model (support vector machine) below according to the results in "Section Parameter tuning 1".
## 2nd model Support Vector Machine Final Submission
# Vectorize the test reviews with the fitted TF-IDF vectorizer. Despite the
# "y_" prefix, this variable holds the test feature matrix, not labels.
y_tfidf_test = tfidf.transform(df_test['review'])
# Polynomial-kernel SVM; change degree/gamma here depending upon the tuning table above.
clf = SVC(degree=2, gamma=10, kernel='poly')
clf.fit(X_tfidf, y_tfidf)
# Predicted label for every test review.
preds = clf.predict(y_tfidf_test)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-e1be44f87d0e> in <module> 1 ## 2nd model Support Vector Machine Final Submission ----> 2 clf = SVC(kernel='poly', gamma=10,degree=2) #Need to change values here depending upon the above table 3 clf.fit(X_tfidf,y_tfidf) 4 #transforming test to tfidf 5 y_tfidf_test = tfidf.transform(df_test['review']) NameError: name 'SVC' is not defined
## 2nd model Support Vector Machine Final Submission
# In your implementation, create the output file using the same format:
# one row per prediction, with "Id" being its 0-based position in `preds`.
dic = {"Id": list(range(len(preds))), "Predicted": list(preds)}
dic_df = pd.DataFrame.from_dict(dic)
# The backslash is escaped explicitly: "\p" is not a valid escape sequence and
# only produced a backslash by accident (newer Python versions warn about it).
dic_df.to_csv(data_path + "\\predicted.csv", index=False)