import pandas as pd
df = pd.read_csv('movie_data.csv')
df.head(10)
|   | review | sentiment |
|---|--------|-----------|
| 0 | In 1974, the teenager Martha Moxley (Maggie Gr... | 1 |
| 1 | OK... so... I really like Kris Kristofferson a... | 0 |
| 2 | ***SPOILER*** Do not read this, if you think a... | 0 |
| 3 | hi for all the people who have seen this wonde... | 1 |
| 4 | I recently bought the DVD, forgetting just how... | 0 |
| 5 | Leave it to Braik to put on a good show. Final... | 1 |
| 6 | Nathan Detroit (Frank Sinatra) is the manager ... | 1 |
| 7 | To understand "Crash Course" in the right cont... | 1 |
| 8 | I've been impressed with Chavez's stance again... | 1 |
| 9 | This movie is directed by Renny Harlin the fin... | 1 |
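The sentiment column is a binary label (presumably 1 = positive, 0 = negative) and is not used by the search engine below; a quick sanity check of the label balance:

# Label balance of the full dataset; the engine below only uses the review text.
print(df.sentiment.value_counts())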
corpus = list(df.review)[:10000]
import re
import en_core_web_sm
nlp = en_core_web_sm.load()
def pre_process(s):
    # Replace runs of non-word characters with a single space, then parse with spaCy.
    s = re.sub(r'\W+', ' ', s)
    return nlp(s)
pre_process('This is a simple preprocessing-function.')
This is a simple preprocessing function
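Note that pre_process returns a spaCy Doc, not a string; the line above is just the Doc's text rendering. A quick check of the token-level attributes the later functions rely on (exact lemmas depend on the spaCy model version):

doc = pre_process('The movies were surprisingly good.')
print([(token.text, token.lemma_) for token in doc])
# e.g. [('The', 'the'), ('movies', 'movie'), ('were', 'be'), ...]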
from tqdm import tqdm
docs = []
for idx, review in tqdm(enumerate(corpus), total = len(corpus)):
    doc = pre_process(review)
    docs.append((idx, review, doc))
100%|██████████| 10000/10000 [04:55<00:00, 33.85it/s]
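Parsing 10,000 reviews one at a time takes a few minutes. As an alternative to the loop above, spaCy's nlp.pipe batches documents through the pipeline and is usually faster; a hedged sketch producing the same (idx, review, doc) tuples (speed-up varies by machine and model):

# Batched parsing with nlp.pipe instead of one nlp() call per review.
cleaned = (re.sub(r'\W+', ' ', review) for review in corpus)
docs = [(idx, corpus[idx], doc)
        for idx, doc in enumerate(nlp.pipe(cleaned, batch_size=64))]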
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_set = set(stopwords.words('english'))
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shangjingbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
from collections import defaultdict
from math import log, sqrt
import numpy as np
from numpy import linalg as LA
from gensim.models import Word2Vec, LdaModel
from gensim.corpora import Dictionary
def fit_IDF(docs, min_df = 1):
    print(f'# of docs = {len(docs)}')
    # Document frequency: number of documents containing each lemma,
    # with stopwords filtered out.
    DF = defaultdict(float)
    for (idx, raw_text, doc) in docs:
        token_set = set(token.lemma_ for token in doc if token.text.lower() not in stopword_set)
        for token in token_set:
            DF[token] += 1
    # Inverse document frequency: log(N / DF), keeping only tokens that are
    # frequent enough (>= min_df) but do not appear in every document.
    IDF = defaultdict(float)
    for token in DF:
        if (DF[token] >= min_df) and (DF[token] < len(docs)):
            IDF[token] = log(len(docs) / DF[token])
        else:
            print('skipped token (too rare or boilerplate) =', token)
    print(f'# of words in index: {len(IDF)}')
    return IDF
IDF = fit_IDF(docs)
# of docs = 10000
# of words in index: 54046
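As a sketch of the weighting this produces: with 10,000 documents, a term found in 100 of them gets IDF log(10000/100) ≈ 4.6, while a term found in 5,000 gets only ≈ 0.69, so rarer terms dominate the tf-idf scores used later.

from math import log
print(log(10000 / 100))   # ≈ 4.605: rare term, high weight
print(log(10000 / 5000))  # ≈ 0.693: common term, low weight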
def fit_w2v(docs, dimension):
    # word2vec trains on sentences, so split each parsed review into
    # lemmatized sentences first.
    sentences = []
    for (idx, raw_text, doc) in docs:
        for sent in doc.sents:
            sentence = []
            for token in sent:
                sentence.append(token.lemma_)
            sentences.append(sentence)
    # Note: `size` is the gensim 3.x parameter name; gensim >= 4.0 renamed it
    # to `vector_size`.
    w2v_model = Word2Vec(min_count=1,
                         window=5,
                         size=dimension,
                         sample=6e-5,
                         alpha=0.1,
                         min_alpha=0.0007,
                         negative=20)
    print(f'# of sentences for word2vec = {len(sentences)}')
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)
    print(f'trained: {w2v_model}')
    return w2v_model
w2v = fit_w2v(docs, 100)
print(w2v.wv.similarity('good', 'bad'))
print(w2v.wv.similarity('good', 'great'))
# of sentences for word2vec = 153836
trained: Word2Vec(vocab=54321, size=100, alpha=0.1)
0.6447691
0.757915
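Note that 'good' and 'bad' also come out fairly similar, which is typical for word2vec on review text: antonyms share contexts. A hedged way to inspect the space further (neighbours vary from run to run):

# Nearest neighbours of 'good' in the trained embedding space;
# a usage sketch only, actual neighbours depend on the run.
print(w2v.wv.most_similar('good', topn=5))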
'''
docs is expected to be a list of (idx, raw_text, doc) tuples,
where doc is a spaCy-parsed document.
'''
def build_lda(docs, num_topics = 5, verbose = False):
    corpus = []
    for (idx, review, doc) in docs:
        tokenized_doc = []
        for token in doc:
            tokenized_doc.append(token.lemma_)
        corpus.append(tokenized_doc)
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.
    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    top_topics = model.top_topics(corpus)  # , num_words=20)
    if verbose:
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        from pprint import pprint
        pprint(top_topics)
    return model, dictionary
model, dictionary = build_lda(docs, verbose = True)
Number of unique tokens: 5586
Number of documents: 10000
Average topic coherence: -1.0234.
[([(0.014194508, 'like'), (0.013051799, 'so'), (0.012601433, 'just'), (0.012122028, 'if'), (0.011540861, 'watch'), (0.01010119, 'bad'), (0.010069379, 'get'), (0.010057744, 'good'), (0.009647687, 'about'), (0.009551709, 'can'), (0.009293668, 'what'), (0.00914493, 'there'), (0.009115378, 'think'), (0.00883015, 'or'), (0.008490109, 'really'), (0.007956818, 'go'), (0.0076453765, 'out'), (0.0074842023, 'would'), (0.007258887, 'show'), (0.0072472496, 'time')], -0.8598190694938721),
 ([(0.009607379, 'by'), (0.009005431, 'there'), (0.008550459, 'scene'), (0.008452919, 'from'), (0.008289491, 'get'), (0.008067865, 'out'), (0.007894481, 'an'), (0.0075486875, 'up'), (0.007393407, 'some'), (0.006928899, 'who'), (0.0067431354, 'like'), (0.0060857395, 'horror'), (0.00595472, 'look'), (0.0057005603, 'bad'), (0.004938874, 'into'), (0.0049017216, 'or'), (0.0048546153, 'just'), (0.00475725, 'go'), (0.004574043, 'when'), (0.004444536, 'so')], -0.8864323809271726),
 ([(0.012413547, 'character'), (0.010294565, 'very'), (0.0101477625, 'an'), (0.009552342, 'well'), (0.009424681, 'story'), (0.008779539, 'good'), (0.008513452, 'more'), (0.007667207, 'from'), (0.0075618154, 'which'), (0.0071534547, 'by'), (0.0063326936, 'some'), (0.005804985, 'than'), (0.0054404843, 'time'), (0.0054208725, 'great'), (0.0051299385, 'much'), (0.00505114, 'most'), (0.0050019473, 'scene'), (0.0048426916, 'also'), (0.0047662756, 'work'), (0.0047206534, 'so')], -0.9802034550143207),
 ([(0.013494943, 'by'), (0.013010925, 'who'), (0.010107275, 'an'), (0.009476522, 'from'), (0.0074381386, 'man'), (0.0062750974, 'about'), (0.006242952, 'life'), (0.005799123, 'when'), (0.005663077, 'or'), (0.005134062, 'what'), (0.0047394573, 'take'), (0.0047339555, 'up'), (0.0047023008, 'out'), (0.0044094766, 'which'), (0.0043049105, 'into'), (0.00418093, 'other'), (0.004039508, 'woman'), (0.0039645582, 'there'), (0.003783309, 'can'), (0.003727606, 'where')], -0.9967430542962419),
 ([(0.016012061, 'play'), (0.014308403, 'who'), (0.013269402, 'by'), (0.011499016, 'great'), (0.0105718635, 'good'), (0.010016862, 'love'), (0.0083144875, 'role'), (0.0074155, 'star'), (0.0070379716, 'when'), (0.0068931784, 'an'), (0.006888408, 'performance'), (0.006430589, 'music'), (0.0061867475, 'year'), (0.005748475, 'also'), (0.0057375403, 'song'), (0.0055810153, 'from'), (0.0054158717, 'John'), (0.005139664, 'time'), (0.005079874, 'old'), (0.005047708, 'up')], -1.3937241970331427)]
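The top topics are dominated by function words (like, so, by, who) because build_lda keeps all lemmas and only filter_extremes prunes the most extreme ones; no stopword filtering is applied here. A hedged sketch for inspecting where a single review lands in topic space (assumes model and dictionary from the cell above):

# Topic mixture of the first parsed review under the trained model.
idx, review, doc = docs[0]
bow = dictionary.doc2bow([token.lemma_ for token in doc])
print(model.get_document_topics(bow))  # e.g. [(0, 0.12), (3, 0.74), ...]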
class SimpleSearchEngine:
    def __init__(self, dimension = 100, topics = 10):
        self.IDF = {}
        self.dimension = dimension
        self.topics = topics

    def fit(self, docs, min_df = 1):
        self.IDF = fit_IDF(docs, min_df)
        self.w2v = fit_w2v(docs, self.dimension)
        self.lda, self.dictionary = build_lda(docs, num_topics = self.topics)

    def retrieve_ranklist(self, query, docs, weight_tfidf = 0.8, weight_word2vec = 0.1, weight_lda = 0.1):
        parsed = pre_process(query)
        order = []
        for (idx, raw_text, doc) in docs:
            sim = self.get_similarity(doc, parsed, weight_tfidf, weight_word2vec, weight_lda)
            order.append((idx, raw_text, sim))
        order.sort(key = lambda x : x[2], reverse = True)
        return order

    '''
    This function returns a linear combination of three cosine similarity
    scores between doc and query.
    doc: It is assumed to be parsed by spaCy
    query: It is assumed to be parsed by spaCy
    '''
    def get_similarity(self, doc, query, weight_tfidf, weight_word2vec, weight_lda):
        # The tf-idf vectors are already L2-normalized, so their sparse
        # dot product is the cosine similarity.
        a = self.get_tfidf(doc)
        b = self.get_tfidf(query)
        sim_tfidf = 0
        for token in a:
            if token in b:
                sim_tfidf += a[token] * b[token]
        a = self.get_word2vec(doc)
        b = self.get_word2vec(query)
        sim_word2vec = np.dot(a, b) / LA.norm(a) / LA.norm(b)
        a = self.get_topics(doc)
        b = self.get_topics(query)
        sim_lda = np.dot(a, b) / LA.norm(a) / LA.norm(b)
        ret = sim_tfidf * weight_tfidf + sim_word2vec * weight_word2vec + sim_lda * weight_lda
        return ret

    '''
    This function returns a sparse, normalized TF-IDF vector as a defaultdict.
    doc: It is assumed to be parsed by spaCy
    '''
    def get_tfidf(self, doc):
        ret = defaultdict(float)
        for token in doc:
            if token.lemma_ in self.IDF:
                ret[token.lemma_] += self.IDF[token.lemma_]
        for token in ret:
            ret[token] /= len(doc)
        # L2-normalize the vector.
        s = 0
        for token, w in ret.items():
            s += w * w
        s = sqrt(s)
        for token in ret:
            ret[token] /= s
        return ret

    '''
    This function returns a dense, low-dimensional vector using the word2vec model trained during fit().
    doc: It is assumed to be parsed by spaCy
    '''
    def get_word2vec(self, doc):
        # IDF-weighted average of the word vectors of in-vocabulary lemmas.
        vec = np.zeros(self.dimension)
        total_weight = 0
        for token in doc:
            if token.lemma_ in self.IDF:
                try:
                    weight = self.IDF[token.lemma_]
                    vec = vec + weight * self.w2v.wv[token.lemma_]
                    total_weight += weight
                except KeyError:
                    pass
        if total_weight > 0:
            vec = vec / total_weight
        return vec

    '''
    This function returns a dense, low-dimensional vector using the LDA model trained during fit().
    doc: It is assumed to be parsed by spaCy
    '''
    def get_topics(self, doc):
        tokens = []
        for token in doc:
            tokens.append(token.lemma_)
        bow = self.dictionary.doc2bow(tokens)
        t = self.lda.get_document_topics(bow)
        ret = np.zeros(self.topics)
        for (index, value) in t:
            ret[index] += value
        return ret
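Because get_tfidf L2-normalizes its output, the sparse dot product in get_similarity is exactly a cosine. A minimal numeric sketch with hypothetical weights:

# Two toy L2-normalized sparse vectors (hypothetical weights):
a = {'movie': 0.6, 'great': 0.8}   # ||a|| = 1
b = {'movie': 1.0}                 # ||b|| = 1
print(sum(a[t] * b[t] for t in a if t in b))  # 0.6 = cosine(a, b)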
engine = SimpleSearchEngine()
engine.fit(docs)
# of docs = 10000
# of words in index: 54046
# of sentences for word2vec = 153836
trained: Word2Vec(vocab=54321, size=100, alpha=0.1)
Number of unique tokens: 5586
Number of documents: 10000
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 0, weight_lda = 1, weight_word2vec = 0)
print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')
Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.9955	4691	This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
0.9941	6071	Wonderful movie. Adult content. Lots of erotic scenes plus excellent music and dance scenes. My wife and I absolutely loved this movie and wish they'd make more like it.
0.9920	7015	Yes, it's not a great cinematic achievement, but Toy Soldiers is a fun and entertaining movie. The young cast does a great job with both dramatic and comedic aspects of the story, and I particularly liked Shawn Phelan as Derek/"Yogurt". I've seen this one plenty of times over the years, and will probably see it several more. Just don't think too much and you'll love it - enjoy!
0.9899	3045	Well, I have not much to say about this film except that it was a truly wonderful film. Natalie Portman is absolutely fantastic as the daughter in this lovely mother-daughter relationship film. <br /><br />Beautiful film.
0.9892	253	This film is brilliant it has cute little dolphins in it and its a great storyline and it has elijah wood in it which makes it a great film too. his acting skills are very good and if you want a good soft family film. this is the one to watch.
---------------------------------------------------------
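The run above uses the LDA signal alone (weight_lda = 1). A hedged sketch for comparing the three signals and the default blend on the same query (weights are illustrative, not tuned; top hits vary with training randomness):

# Compare the top-3 document ids under each single signal and the default blend.
for wt, ww, wl in [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0.8, 0.1, 0.1)]:
    top = engine.retrieve_ranklist(query, docs, weight_tfidf=wt, weight_word2vec=ww, weight_lda=wl)[:3]
    print((wt, ww, wl), [idx for (idx, _, _) in top])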