#load csv file
from sklearn import metrics
from numpy import savetxt
import pandas as pd
from sklearn.svm import SVC
import os

# Directory that holds the train.csv / test.csv files read below.
data_path = r'C:\Users\mbvsuraj\Documents\Python_Scripts\rrevif\wilsonchua\Project2\wilsonchuah-attachments'

# Build the file paths with os.path.join instead of gluing on hard-coded
# "\\" separators by string concatenation.
df_train = pd.read_csv(os.path.join(data_path, "train.csv"))
df_test = pd.read_csv(os.path.join(data_path, "test.csv"))
#fpb = pd.read_csv("~/Downloads/FPB.csv", encoding = "ISO-8859-1", header = None)
#fpb.columns = ["sentiment", "review"]
# Notebook-style preview of the 'label' and 'review' columns; the result is
# discarded when this file is run as a plain script.
df_train[['label','review']].head()


import nltk
import re
import os 

# fetch the NLTK data packages this script relies on
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords
# keep the English stop words in a set so membership tests are cheap
stop = set(stopwords.words('english'))


from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer 


# Module-level Porter stemmer instance; pre_processing_by_nltk calls ps.stem on each token.
ps = PorterStemmer() 


# create text preprocessing function, not removing Re
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    """Tokenize *doc*, drop English stop words and optionally stem the rest.

    Args:
        doc: Raw text to process.
        stemming: When true, every kept token is passed through ``ps.stem``.
        need_sent: When true, return one token list per sentence instead of
            a single flat list.

    Returns:
        A flat list of lower-cased tokens, or a list of such lists (one per
        sentence) when ``need_sent`` is true.
    """
    def _clean(words):
        # Lower-case before the stop-word test so capitalised stop words
        # ("The") match the lower-case entries in ``stop``.
        lowered = (w.lower() for w in words)
        kept = [w for w in lowered if w not in stop]
        # Stem only after filtering: a stemmed stop word such as "ha"
        # (from "has") would no longer be found in ``stop``.
        if stemming:
            return [ps.stem(w) for w in kept]
        return kept

    # step 1: split into sentences; step 2: tokenize and clean each sentence
    per_sentence = [_clean(word_tokenize(sent)) for sent in sent_tokenize(doc)]
    if need_sent:
        # Previously this path raised TypeError because whole sentence lists
        # were tested for membership in the ``stop`` set.
        return per_sentence
    return [token for sentence in per_sentence for token in sentence]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbvsuraj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbvsuraj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# choose the first 5 row and apply preprocessing function

#fpb.review[:5].apply(pre_processing_by_nltk)

0    [accord, gran, ,, compani, ha, plan, move, pro...
1    [technopoli, plan, develop, stage, area, less,...
2    [intern, electron, industri, compani, elcoteq,...
3    [new, product, plant, compani, would, increas,...
4    [accord, compani, 's, updat, strategi, year, 2...
Name: review, dtype: object


from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#np.set_printoptions(precision=2)

# TF-IDF vectorizer that tokenizes with pre_processing_by_nltk
tfidf = TfidfVectorizer(lowercase=True, tokenizer=pre_processing_by_nltk)

# target labels for the training rows
y_tfidf = df_train['label'].values
#X_tfidf = tfidf.fit_transform(df_train['review'])

#Xt_train, Xt_remain, yt_train, yt_remain = train_test_split(X_tfidf, y_tfidf,random_state=42, test_size=0.3, shuffle=True)
#Xt_test, Xt_valid, yt_test, yt_valid = train_test_split(Xt_remain, yt_remain,random_state=42, test_size=0.5, shuffle=True)
#clf_tfidf = LogisticRegression(C=10,random_state=42,n_jobs=-1,max_iter=100).fit(Xt_train, yt_train)

# calculate the tfidf validation score
#print(clf_tfidf.score(Xt_valid, yt_valid))


# Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two review columns the same way.
total_data = pd.concat([df_train['review'], df_test['review']])
# NOTE(review): the vectorizer is fitted on train + test reviews together,
# so test text influences the vocabulary/IDF weights - confirm this is intended.
ss = tfidf.fit_transform(total_data)
# feature matrix for the training reviews only
X_tfidf = tfidf.transform(df_train['review'])


# train_X, test_X, train_y, test_y = train_test_split(X_tfidf,
#                                                     y_tfidf,
#                                                     test_size=0.3,
#                                                     random_state=1
#                                             )

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-329e5ad373b7> in <module>
      2                                                     y_tfidf,
      3                                                     test_size=0.3,
----> 4                                                     random_state=1
      5                                             )

~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_split.py in train_test_split(*arrays, **options)
   2125         raise TypeError("Invalid parameters passed: %s" % str(options))
   2126 
-> 2127     arrays = indexable(*arrays)
   2128 
   2129     n_samples = _num_samples(arrays[0])

~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in indexable(*iterables)
    290     """
    291     result = [_make_indexable(X) for X in iterables]
--> 292     check_consistent_length(*result)
    293     return result
    294 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     if len(uniques) > 1:
    255         raise ValueError("Found input variables with inconsistent numbers of"
--> 256                          " samples: %r" % [int(l) for l in lengths])
    257 
    258 

ValueError: Found input variables with inconsistent numbers of samples: [13144, 13143]


# ## Comment the text before submitting
# gammas = [10, 100,1000]

# f1_table = pd.DataFrame(columns = ['degree','gamma','f1_score'])
# degrees = [ 2, 3, 4, 5, 6]
# f1_table['Solver'] = pd.Series(range(1,41))
# j=0
# for degree in degrees:
#     for i in gammas:
#         # Apply SVM regression model to training data
#         lr = SVC(kernel='poly', gamma=i,degree=degree)
#         lr.fit(train_X,train_y)

#         # Predict using model
#         y_pred = lr.predict(test_X)

#         # Saving f1 score score in table        
#         f1_table.iloc[j,2] = metrics.f1_score(test_y,y_pred,average='micro')
#         f1_table.iloc[j,1] = i
#         f1_table.iloc[j,0] =degree 
#         j += 1
# print(f1_table)

   degree gamma  f1_score  Solver
0       2    10  0.794371       1
1       2   100  0.794371       2
2       2  1000  0.794371       3
3       3    10  0.762424       4
4       3   100  0.762424       5
5       3  1000  0.762424       6
6       4    10  0.704361       7
7       4   100  0.704361       8
8       4  1000  0.704361       9
9       5    10  0.614604      10
10      5   100  0.614604      11
11      5  1000  0.614604      12
12      6    10  0.508367      13
13      6   100  0.508367      14
14      6  1000  0.508367      15
15    NaN   NaN       NaN      16
16    NaN   NaN       NaN      17
17    NaN   NaN       NaN      18
18    NaN   NaN       NaN      19
19    NaN   NaN       NaN      20
20    NaN   NaN       NaN      21
21    NaN   NaN       NaN      22
22    NaN   NaN       NaN      23
23    NaN   NaN       NaN      24
24    NaN   NaN       NaN      25
25    NaN   NaN       NaN      26
26    NaN   NaN       NaN      27
27    NaN   NaN       NaN      28
28    NaN   NaN       NaN      29
29    NaN   NaN       NaN      30
30    NaN   NaN       NaN      31
31    NaN   NaN       NaN      32
32    NaN   NaN       NaN      33
33    NaN   NaN       NaN      34
34    NaN   NaN       NaN      35
35    NaN   NaN       NaN      36
36    NaN   NaN       NaN      37
37    NaN   NaN       NaN      38
38    NaN   NaN       NaN      39
39    NaN   NaN       NaN      40


# ##Comment the text before submitting
# C_param_range = [0.1,1,10,100,1000]
# f1_table = pd.DataFrame(columns = ['Solver','C_parameter','f1_score'])
# solver_eg = ['saga','newton-cg','lbfgs','sag']
# f1_table['Solver'] = pd.Series(range(1,31))
# j=0

# for text in solver_eg:
#     for i in C_param_range:
#         # Apply logistic regression model to training data
#         lr = LogisticRegression( C = i,random_state = 42,multi_class='multinomial', solver=text,max_iter=100000000)
#         lr.fit(train_X,train_y)

#         # Predict using model
#         y_pred = lr.predict(test_X)

#         # Saving f1 score score in table        
#         f1_table.iloc[j,2] = metrics.f1_score(test_y,y_pred,average='weighted')
#         f1_table.iloc[j,1] = i
#         f1_table.iloc[j,0] =text
#         j += 1
# print(f1_table)

       Solver C_parameter  f1_score
0        saga         0.1  0.657831
1        saga           1  0.766212
2        saga          10   0.79263
3        saga         100  0.795402
4        saga        1000  0.792978
5   newton-cg         0.1  0.658153
6   newton-cg           1  0.766445
7   newton-cg          10   0.79263
8   newton-cg         100  0.794655
9   newton-cg        1000  0.793663
10      lbfgs         0.1  0.658153
11      lbfgs           1  0.766445
12      lbfgs          10  0.792643
13      lbfgs         100  0.794655
14      lbfgs        1000  0.793448
15        sag         0.1  0.658153
16        sag           1  0.766445
17        sag          10  0.792643
18        sag         100  0.794865
19        sag        1000  0.795453
20         21         NaN       NaN
21         22         NaN       NaN
22         23         NaN       NaN
23         24         NaN       NaN
24         25         NaN       NaN
25         26         NaN       NaN
26         27         NaN       NaN
27         28         NaN       NaN
28         29         NaN       NaN
29         30         NaN       NaN


# ## Comment the text before submitting
# #Final Submission Logistic Regression
# clf = LogisticRegression(C = 1000,random_state = 42,multi_class='multinomial', solver='sag',max_iter=100000000)
# clf.fit(train_X,train_y)
# y_pred = clf.predict(test_X)
# metrics.f1_score(test_y,y_pred,average='micro')
# #F1 score 79.54


# ## Comment the text before submitting
# #Final Submission SVM
# clf = SVC(kernel='poly', gamma=10,degree=2)
# clf.fit(train_X,train_y)
# y_pred = clf.predict(test_X)
# metrics.f1_score(test_y,y_pred,average='micro')
# #f1 score 79.56


# ## 1st model Logistic regression Final Submission
# clf = LogisticRegression(C = 1000,random_state = 42,multi_class='multinomial', solver='sag',max_iter=100000000) #Need to change values here depending upon the above table
# clf.fit(X_tfidf,y_tfidf)
# #transforming test to tfidf
# y_tfidf_test = tfidf.transform(df_test['review'])
# preds = clf.predict(y_tfidf_test)


# ## 1st model Logistic regression Final Submission
# # in your implementation, create the output file using the same format
# dic = {"Id": [], "Predicted": []}
# for i, pred in enumerate(preds):
#     dic["Id"].append(i)
#     dic["Predicted"].append(pred)

# dic_df = pd.DataFrame.from_dict(dic)
# dic_df.to_csv(data_path + "predicted.csv", index=False)


## 2nd model Support Vector Machine Final Submission
# Hyper-parameters here should be updated to follow the tuning table above.
clf = SVC(kernel='poly', gamma=10, degree=2).fit(X_tfidf, y_tfidf)
# Vectorize the test reviews with the already-fitted TF-IDF model, then predict.
y_tfidf_test = tfidf.transform(df_test['review'])
preds = clf.predict(y_tfidf_test)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-e1be44f87d0e> in <module>
      1 ## 2nd model Support Vector Machine Final Submission
----> 2 clf = SVC(kernel='poly', gamma=10,degree=2) #Need to change values here depending upon the above table
      3 clf.fit(X_tfidf,y_tfidf)
      4 #transforming test to tfidf
      5 y_tfidf_test = tfidf.transform(df_test['review'])

NameError: name 'SVC' is not defined


## 2nd model Support Vector Machine Final Submission
# in your implementation, create the output file using the same format
# Two columns: the row position of each prediction as "Id" and the
# predicted label as "Predicted".
dic = {"Id": list(range(len(preds))), "Predicted": list(preds)}

dic_df = pd.DataFrame.from_dict(dic)
# os.path.join replaces the literal "\predicted.csv", whose "\p" is an
# invalid escape sequence that newer Python versions warn about.
dic_df.to_csv(os.path.join(data_path, "predicted.csv"), index=False)

	label	review
0	american (traditional)	So, we stopped here on our way to the Side Que...
1	american (new)	This is our go-to healthy spot! The food is al...
2	mexican	Food court meal at Gerrard Square. It's been ...
3	mexican	Located on Rainbow/Charleston, this small fami...
4	chinese	No frills Chinese takeout joint which serves u...

Name: Wilson Chuah. Subject: MGTA 415 Homework 1

Problem 1: Text Pre-Processing

TF-IDF vector