import pandas as pd
df = pd.read_csv('movie_data.csv')
df.head(10)
|   | review | sentiment |
|---|--------|-----------|
| 0 | In 1974, the teenager Martha Moxley (Maggie Gr... | 1 |
| 1 | OK... so... I really like Kris Kristofferson a... | 0 |
| 2 | ***SPOILER*** Do not read this, if you think a... | 0 |
| 3 | hi for all the people who have seen this wonde... | 1 |
| 4 | I recently bought the DVD, forgetting just how... | 0 |
| 5 | Leave it to Braik to put on a good show. Final... | 1 |
| 6 | Nathan Detroit (Frank Sinatra) is the manager ... | 1 |
| 7 | To understand "Crash Course" in the right cont... | 1 |
| 8 | I've been impressed with Chavez's stance again... | 1 |
| 9 | This movie is directed by Renny Harlin the fin... | 1 |
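The sentiment column is a binary label (presumably 1 = positive, 0 = negative) and is not used by the search engine below; a quick sanity check of the label balance:

# Label balance of the full dataset; the engine below only uses the review text.
print(df.sentiment.value_counts())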
corpus = list(df.review)[:10000]
import re
import en_core_web_sm
nlp = en_core_web_sm.load()
def pre_process(s):
    # Replace runs of non-word characters with a single space, then parse with spaCy.
    s = re.sub(r'\W+', ' ', s)
    return nlp(s)
pre_process('This is a simple preprocessing-function.')
This is a simple preprocessing function
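Note that pre_process returns a spaCy Doc, not a string; the line above is just the Doc's text rendering. A quick check of the token-level attributes the later functions rely on (exact lemmas depend on the spaCy model version):

doc = pre_process('The movies were surprisingly good.')
print([(token.text, token.lemma_) for token in doc])
# e.g. [('The', 'the'), ('movies', 'movie'), ('were', 'be'), ...]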
from tqdm import tqdm
docs = []
for idx, review in tqdm(enumerate(corpus), total = len(corpus)):
    doc = pre_process(review)
    docs.append((idx, review, doc))
100%|██████████| 10000/10000 [04:55<00:00, 33.85it/s]
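Parsing 10,000 reviews one at a time takes a few minutes. As an alternative to the loop above, spaCy's nlp.pipe batches documents through the pipeline and is usually faster; a hedged sketch producing the same (idx, review, doc) tuples (speed-up varies by machine and model):

# Batched parsing with nlp.pipe instead of one nlp() call per review.
cleaned = (re.sub(r'\W+', ' ', review) for review in corpus)
docs = [(idx, corpus[idx], doc)
        for idx, doc in enumerate(nlp.pipe(cleaned, batch_size=64))]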
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_set = set(stopwords.words('english'))
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shangjingbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
from collections import defaultdict
from math import log, sqrt
import numpy as np
from numpy import linalg as LA
from gensim.models import Word2Vec, LdaModel
from gensim.corpora import Dictionary
def fit_IDF(docs, min_df = 1):
    print(f'# of docs = {len(docs)}')
    # Document frequency: number of documents containing each lemma,
    # with stopwords filtered out.
    DF = defaultdict(float)
    for (idx, raw_text, doc) in docs:
        token_set = set(token.lemma_ for token in doc if token.text.lower() not in stopword_set)
        for token in token_set:
            DF[token] += 1
    # Inverse document frequency: log(N / DF), keeping only tokens that are
    # frequent enough (>= min_df) but do not appear in every document.
    IDF = defaultdict(float)
    for token in DF:
        if (DF[token] >= min_df) and (DF[token] < len(docs)):
            IDF[token] = log(len(docs) / DF[token])
        else:
            print('skipped token (too rare or boilerplate) =', token)
    print(f'# of words in index: {len(IDF)}')
    return IDF
IDF = fit_IDF(docs)
# of docs = 10000
# of words in index: 54046
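As a sketch of the weighting this produces: with 10,000 documents, a term found in 100 of them gets IDF log(10000/100) ≈ 4.6, while a term found in 5,000 gets only ≈ 0.69, so rarer terms dominate the tf-idf scores used later.

from math import log
print(log(10000 / 100))   # ≈ 4.605: rare term, high weight
print(log(10000 / 5000))  # ≈ 0.693: common term, low weight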
def fit_w2v(docs, dimension):
    # word2vec trains on sentences, so split each parsed review into
    # lemmatized sentences first.
    sentences = []
    for (idx, raw_text, doc) in docs:
        for sent in doc.sents:
            sentence = []
            for token in sent:
                sentence.append(token.lemma_)
            sentences.append(sentence)
    # Note: `size` is the gensim 3.x parameter name; gensim >= 4.0 renamed it
    # to `vector_size`.
    w2v_model = Word2Vec(min_count=1,
                         window=5,
                         size=dimension,
                         sample=6e-5,
                         alpha=0.1,
                         min_alpha=0.0007,
                         negative=20)
    print(f'# of sentences for word2vec = {len(sentences)}')
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)
    print(f'trained: {w2v_model}')
    return w2v_model
w2v = fit_w2v(docs, 100)
print(w2v.wv.similarity('good', 'bad'))
print(w2v.wv.similarity('good', 'great'))
# of sentences for word2vec = 153836
trained: Word2Vec(vocab=54321, size=100, alpha=0.1)
0.6447691
0.757915
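Note that 'good' and 'bad' also come out fairly similar, which is typical for word2vec on review text: antonyms share contexts. A hedged way to inspect the space further (neighbours vary from run to run):

# Nearest neighbours of 'good' in the trained embedding space;
# a usage sketch only, actual neighbours depend on the run.
print(w2v.wv.most_similar('good', topn=5))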
'''
docs is expected to be a list of (idx, raw_text, doc) tuples,
where doc is a spaCy-parsed document.
'''
def build_lda(docs, num_topics = 5, verbose = False):
    corpus = []
    for (idx, review, doc) in docs:
        tokenized_doc = []
        for token in doc:
            tokenized_doc.append(token.lemma_)
        corpus.append(tokenized_doc)
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in corpus]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.
    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    top_topics = model.top_topics(corpus)  # , num_words=20)
    if verbose:
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        print('Average topic coherence: %.4f.' % avg_topic_coherence)
        from pprint import pprint
        pprint(top_topics)
    return model, dictionary
model, dictionary = build_lda(docs, verbose = True)
Number of unique tokens: 5586
Number of documents: 10000
Average topic coherence: -1.0234.
[([(0.014194508, 'like'), (0.013051799, 'so'), (0.012601433, 'just'), (0.012122028, 'if'), (0.011540861, 'watch'), (0.01010119, 'bad'), (0.010069379, 'get'), (0.010057744, 'good'), (0.009647687, 'about'), (0.009551709, 'can'), (0.009293668, 'what'), (0.00914493, 'there'), (0.009115378, 'think'), (0.00883015, 'or'), (0.008490109, 'really'), (0.007956818, 'go'), (0.0076453765, 'out'), (0.0074842023, 'would'), (0.007258887, 'show'), (0.0072472496, 'time')], -0.8598190694938721),
 ([(0.009607379, 'by'), (0.009005431, 'there'), (0.008550459, 'scene'), (0.008452919, 'from'), (0.008289491, 'get'), (0.008067865, 'out'), (0.007894481, 'an'), (0.0075486875, 'up'), (0.007393407, 'some'), (0.006928899, 'who'), (0.0067431354, 'like'), (0.0060857395, 'horror'), (0.00595472, 'look'), (0.0057005603, 'bad'), (0.004938874, 'into'), (0.0049017216, 'or'), (0.0048546153, 'just'), (0.00475725, 'go'), (0.004574043, 'when'), (0.004444536, 'so')], -0.8864323809271726),
 ([(0.012413547, 'character'), (0.010294565, 'very'), (0.0101477625, 'an'), (0.009552342, 'well'), (0.009424681, 'story'), (0.008779539, 'good'), (0.008513452, 'more'), (0.007667207, 'from'), (0.0075618154, 'which'), (0.0071534547, 'by'), (0.0063326936, 'some'), (0.005804985, 'than'), (0.0054404843, 'time'), (0.0054208725, 'great'), (0.0051299385, 'much'), (0.00505114, 'most'), (0.0050019473, 'scene'), (0.0048426916, 'also'), (0.0047662756, 'work'), (0.0047206534, 'so')], -0.9802034550143207),
 ([(0.013494943, 'by'), (0.013010925, 'who'), (0.010107275, 'an'), (0.009476522, 'from'), (0.0074381386, 'man'), (0.0062750974, 'about'), (0.006242952, 'life'), (0.005799123, 'when'), (0.005663077, 'or'), (0.005134062, 'what'), (0.0047394573, 'take'), (0.0047339555, 'up'), (0.0047023008, 'out'), (0.0044094766, 'which'), (0.0043049105, 'into'), (0.00418093, 'other'), (0.004039508, 'woman'), (0.0039645582, 'there'), (0.003783309, 'can'), (0.003727606, 'where')], -0.9967430542962419),
 ([(0.016012061, 'play'), (0.014308403, 'who'), (0.013269402, 'by'), (0.011499016, 'great'), (0.0105718635, 'good'), (0.010016862, 'love'), (0.0083144875, 'role'), (0.0074155, 'star'), (0.0070379716, 'when'), (0.0068931784, 'an'), (0.006888408, 'performance'), (0.006430589, 'music'), (0.0061867475, 'year'), (0.005748475, 'also'), (0.0057375403, 'song'), (0.0055810153, 'from'), (0.0054158717, 'John'), (0.005139664, 'time'), (0.005079874, 'old'), (0.005047708, 'up')], -1.3937241970331427)]
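The top topics are dominated by function words (like, so, by, who) because build_lda keeps all lemmas and only filter_extremes prunes the most extreme ones; no stopword filtering is applied here. A hedged sketch for inspecting where a single review lands in topic space (assumes model and dictionary from the cell above):

# Topic mixture of the first parsed review under the trained model.
idx, review, doc = docs[0]
bow = dictionary.doc2bow([token.lemma_ for token in doc])
print(model.get_document_topics(bow))  # e.g. [(0, 0.12), (3, 0.74), ...]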
class SimpleSearchEngine:
    def __init__(self, dimension = 100, topics = 10):
        self.IDF = {}
        self.dimension = dimension
        self.topics = topics

    def fit(self, docs, min_df = 1):
        self.IDF = fit_IDF(docs, min_df)
        self.w2v = fit_w2v(docs, self.dimension)
        self.lda, self.dictionary = build_lda(docs, num_topics = self.topics)

    def retrieve_ranklist(self, query, docs, weight_tfidf = 0.8, weight_word2vec = 0.1, weight_lda = 0.1):
        parsed = pre_process(query)
        order = []
        for (idx, raw_text, doc) in docs:
            sim = self.get_similarity(doc, parsed, weight_tfidf, weight_word2vec, weight_lda)
            order.append((idx, raw_text, sim))
        order.sort(key = lambda x : x[2], reverse = True)
        return order

    '''
    This function returns a linear combination of three cosine similarity
    scores between doc and query.
    doc: It is assumed to be parsed by spaCy
    query: It is assumed to be parsed by spaCy
    '''
    def get_similarity(self, doc, query, weight_tfidf, weight_word2vec, weight_lda):
        # The tf-idf vectors are already L2-normalized, so their sparse
        # dot product is the cosine similarity.
        a = self.get_tfidf(doc)
        b = self.get_tfidf(query)
        sim_tfidf = 0
        for token in a:
            if token in b:
                sim_tfidf += a[token] * b[token]
        a = self.get_word2vec(doc)
        b = self.get_word2vec(query)
        sim_word2vec = np.dot(a, b) / LA.norm(a) / LA.norm(b)
        a = self.get_topics(doc)
        b = self.get_topics(query)
        sim_lda = np.dot(a, b) / LA.norm(a) / LA.norm(b)
        ret = sim_tfidf * weight_tfidf + sim_word2vec * weight_word2vec + sim_lda * weight_lda
        return ret

    '''
    This function returns a sparse, normalized TF-IDF vector as a defaultdict.
    doc: It is assumed to be parsed by spaCy
    '''
    def get_tfidf(self, doc):
        ret = defaultdict(float)
        for token in doc:
            if token.lemma_ in self.IDF:
                ret[token.lemma_] += self.IDF[token.lemma_]
        for token in ret:
            ret[token] /= len(doc)
        # L2-normalize the vector.
        s = 0
        for token, w in ret.items():
            s += w * w
        s = sqrt(s)
        for token in ret:
            ret[token] /= s
        return ret

    '''
    This function returns a dense, low-dimensional vector using the word2vec model trained during fit().
    doc: It is assumed to be parsed by spaCy
    '''
    def get_word2vec(self, doc):
        # IDF-weighted average of the word vectors of in-vocabulary lemmas.
        vec = np.zeros(self.dimension)
        total_weight = 0
        for token in doc:
            if token.lemma_ in self.IDF:
                try:
                    weight = self.IDF[token.lemma_]
                    vec = vec + weight * self.w2v.wv[token.lemma_]
                    total_weight += weight
                except KeyError:
                    pass
        if total_weight > 0:
            vec = vec / total_weight
        return vec

    '''
    This function returns a dense, low-dimensional vector using the LDA model trained during fit().
    doc: It is assumed to be parsed by spaCy
    '''
    def get_topics(self, doc):
        tokens = []
        for token in doc:
            tokens.append(token.lemma_)
        bow = self.dictionary.doc2bow(tokens)
        t = self.lda.get_document_topics(bow)
        ret = np.zeros(self.topics)
        for (index, value) in t:
            ret[index] += value
        return ret
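Because get_tfidf L2-normalizes its output, the sparse dot product in get_similarity is exactly a cosine. A minimal numeric sketch with hypothetical weights:

# Two toy L2-normalized sparse vectors (hypothetical weights):
a = {'movie': 0.6, 'great': 0.8}   # ||a|| = 1
b = {'movie': 1.0}                 # ||b|| = 1
print(sum(a[t] * b[t] for t in a if t in b))  # 0.6 = cosine(a, b)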
engine = SimpleSearchEngine()
engine.fit(docs)
# of docs = 10000
# of words in index: 54046
# of sentences for word2vec = 153836
trained: Word2Vec(vocab=54321, size=100, alpha=0.1)
Number of unique tokens: 5586
Number of documents: 10000
query = 'impressive classical romantic movie'
ranks = engine.retrieve_ranklist(query, docs, weight_tfidf = 0, weight_lda = 1, weight_word2vec = 0)
print('Query =', query)
print('---------------------------------------------------------')
print('Similarity\tID\tDescription')
for (idx, description, similarity) in ranks[:5]:
    print('%.4f\t%s\t%s' % (similarity, idx, description))
print('---------------------------------------------------------\n\n')
Query = impressive classical romantic movie
---------------------------------------------------------
Similarity	ID	Description
0.9955	4691	This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
0.9941	6071	Wonderful movie. Adult content. Lots of erotic scenes plus excellent music and dance scenes. My wife and I absolutely loved this movie and wish they'd make more like it.
0.9920	7015	Yes, it's not a great cinematic achievement, but Toy Soldiers is a fun and entertaining movie. The young cast does a great job with both dramatic and comedic aspects of the story, and I particularly liked Shawn Phelan as Derek/"Yogurt". I've seen this one plenty of times over the years, and will probably see it several more. Just don't think too much and you'll love it - enjoy!
0.9899	3045	Well, I have not much to say about this film except that it was a truly wonderful film. Natalie Portman is absolutely fantastic as the daughter in this lovely mother-daughter relationship film. <br /><br />Beautiful film.
0.9892	253	This film is brilliant it has cute little dolphins in it and its a great storyline and it has elijah wood in it which makes it a great film too. his acting skills are very good and if you want a good soft family film. this is the one to watch.
---------------------------------------------------------
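The run above uses the LDA signal alone (weight_lda = 1). A hedged sketch for comparing the three signals and the default blend on the same query (weights are illustrative, not tuned; top hits vary with training randomness):

# Compare the top-3 document ids under each single signal and the default blend.
for wt, ww, wl in [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0.8, 0.1, 0.1)]:
    top = engine.retrieve_ranklist(query, docs, weight_tfidf=wt, weight_word2vec=ww, weight_lda=wl)[:3]
    print((wt, ww, wl), [idx for (idx, _, _) in top])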