import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt') # downloads you a model
nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to /home/jovyan/nltk_data... [nltk_data] Package stopwords is already up-to-date!
# AutoPhrase output: tab-separated (quality score, phrase) pairs with no header row.
dataset = pd.read_csv("data/AutoPhrase.txt", sep="\t", header=None)
dataset  # notebook display of the raw frame
0 | 1 | |
---|---|---|
0 | 0.987412 | density estimation |
1 | 0.985778 | inverse kinematics |
2 | 0.985462 | gröbner bases |
3 | 0.984563 | amdahl's law |
4 | 0.983978 | mutual information |
... | ... | ... |
720233 | 0.004385 | this algorithm does not |
720234 | 0.004292 | a programming paradigm for |
720235 | 0.004292 | an adaptive method of |
720236 | 0.004292 | an integrated database for |
720237 | 0.003529 | and more widely used |
720238 rows × 2 columns
# Give the two unnamed columns meaningful labels, mutating the frame in place.
dataset.rename(columns={0: "score", 1: "phrase"}, inplace=True)
# Peek at the highest-quality phrases (file is sorted by score, descending).
dataset.head(30)
score | phrase | |
---|---|---|
0 | 0.987412 | density estimation |
1 | 0.985778 | inverse kinematics |
2 | 0.985462 | gröbner bases |
3 | 0.984563 | amdahl's law |
4 | 0.983978 | mutual information |
5 | 0.983851 | lecture notes |
6 | 0.983412 | naïve bayes |
7 | 0.983329 | texas instruments |
8 | 0.983263 | bundle adjustment |
9 | 0.983129 | computerized tomography |
10 | 0.983021 | chronic diseases |
11 | 0.982538 | motif discovery |
12 | 0.982517 | signaling pathway |
13 | 0.982216 | disaster recovery |
14 | 0.982161 | nuclear magnetic resonance |
15 | 0.982121 | insider threats |
16 | 0.982113 | finitely generated |
17 | 0.981962 | partially observable |
18 | 0.981710 | reinforcement learning |
19 | 0.981573 | biped locomotion |
20 | 0.981407 | combinatory categorial grammar |
21 | 0.981361 | tablet pcs |
22 | 0.981327 | south korea |
23 | 0.981282 | xilinx xc4000 |
24 | 0.981279 | alzheimer's disease |
25 | 0.981279 | river basin |
26 | 0.981236 | universally composable |
27 | 0.981046 | south africa |
28 | 0.981026 | vickrey auctions |
29 | 0.980971 | confocal microscopy |
# Lowest-scoring entries: mostly generic sentence fragments rather than real phrases.
dataset.tail(30)
score | phrase | |
---|---|---|
720208 | 0.005365 | for systems described by |
720209 | 0.005358 | an alternative technique for |
720210 | 0.005358 | an efficient processing of |
720211 | 0.005348 | the technique allows for |
720212 | 0.005348 | a lens for |
720213 | 0.005308 | the modular decomposition of |
720214 | 0.005308 | a significant speedup for |
720215 | 0.005308 | a small area of |
720216 | 0.005308 | a layered approach for |
720217 | 0.005303 | to provide flexibility in |
720218 | 0.005294 | a converter for |
720219 | 0.005292 | a powerful method of |
720220 | 0.005292 | a high degree of confidence in |
720221 | 0.005292 | the equilibrium point for |
720222 | 0.005292 | a classification scheme of |
720223 | 0.005292 | a big problem in |
720224 | 0.005292 | a hybrid solution for |
720225 | 0.005292 | a solution approach for |
720226 | 0.005271 | a fundamental result in |
720227 | 0.005271 | a critical requirement in |
720228 | 0.005271 | the algorithm works with |
720229 | 0.005271 | the main bottleneck of |
720230 | 0.005259 | and more important for |
720231 | 0.005248 | a preparation for |
720232 | 0.004871 | an efficient test for |
720233 | 0.004385 | this algorithm does not |
720234 | 0.004292 | a programming paradigm for |
720235 | 0.004292 | an adaptive method of |
720236 | 0.004292 | an integrated database for |
720237 | 0.003529 | and more widely used |
# Phrase-segmented corpus: one segmented title per line, tab-separated, no header.
df = pd.read_csv("data/segmentation.txt", sep="\t", header=None)
df.columns = ["phrase"]
def process(text):
    """Normalize one segmented phrase string.

    Spaces (the segmenter's phrase separators) become commas, underscores
    (intra-phrase word joins) become spaces, and the result is lowercased,
    yielding a comma-separated list of multi-word phrases.
    """
    # Parameter renamed from `str` so the builtin type is not shadowed.
    return text.replace(' ', ',').replace('_', ' ').lower()
# Normalize every title with `process` and keep the original row index as a column.
df_ = df['phrase'].astype(str).map(process).reset_index()
df_.head(20)
index | phrase | |
---|---|---|
0 | 0 | oql,c++,extending,c++ |
1 | 1 | transaction management,multidatabase systems |
2 | 2 | overview |
3 | 3 | multimedia,information |
4 | 4 | active,database systems |
5 | 5 | object-oriented,dbmss,early |
6 | 6 | distributed,databases |
7 | 7 | an object-oriented,dbms,war,story,developing,g... |
8 | 8 | cooperative,multiuser |
9 | 9 | architecture,multidatabase |
10 | 10 | physical object,management |
11 | 11 | introduction,next-generation,database,technology |
12 | 12 | object-oriented,database systems,reality |
13 | 13 | introduction,technology,interoperating,legacy ... |
14 | 14 | resolving,schematic,multidatabase systems |
15 | 15 | performance benchmark,object-oriented,database... |
16 | 16 | object-oriented,databases |
17 | 17 | solution,managing,e,p,data |
18 | 18 | c++,object database |
19 | 19 | authorization,object-oriented,databases |
def preprocess_df(df):
    """Strip stopwords and single-character tokens from df["phrase"], in place.

    Each cell of df["phrase"] is expected to be a comma-separated string of
    phrases (as produced by `process`); it is replaced by the list of phrases
    that survive filtering. Returns the mutated DataFrame.
    """
    # English stopwords plus one domain-specific addition.
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # Vectorized replacement for the original iterrows() append-loop:
    # same filter (drop stopwords and length-1 tokens), same row order.
    df["phrase"] = [
        [word for word in sent.strip().split(',')
         if word not in stop_words and len(word) != 1]
        for sent in df["phrase"]
    ]
    return df
# Work on a copy so the original normalized frame `df_` stays intact.
processed_data_ = preprocess_df(df_.copy())
# The original wrapped this in an enumerate whose index was never used;
# a plain list() of the column is the idiomatic equivalent.
tagged_data = list(processed_data_["phrase"])
tagged_data[:10]
[['oql', 'c++', 'extending', 'c++'], ['transaction management', 'multidatabase systems'], ['overview'], ['multimedia', 'information'], ['active', 'database systems'], ['object-oriented', 'dbmss', 'early'], ['distributed', 'databases'], ['an object-oriented', 'dbms', 'war', 'story', 'developing', 'genome', 'mapping', 'database', 'c++'], ['cooperative', 'multiuser'], ['architecture', 'multidatabase']]
from gensim.models import word2vec
# NOTE(review): `corpus` is never used in the visible code below — looks like
# a leftover; confirm it is unused further down before removing.
corpus = df_.phrase
# Train Word2Vec on the phrase lists; min_count=1 keeps every token,
# including tokens that occur only once.
w2v = word2vec.Word2Vec(sentences = tagged_data, min_count=1)
# Sanity check: nearest neighbours of "computer science" in embedding space.
w2v.wv.most_similar('computer science', topn=10)
[('curriculum', 0.966228723526001), ('mathematics', 0.9508180022239685), ('undergraduate', 0.9425908327102661), ('graduate', 0.9304831624031067), ('science students', 0.9252045154571533), ('clemson', 0.9201644062995911), ('education', 0.9200882911682129), ('liberal arts college', 0.9195355176925659), ('se', 0.9188847541809082), ('baccalaureate degree', 0.9187815189361572)]
# Nearest neighbours of "resource management" in the learned embedding space.
w2v.wv.most_similar('resource management', topn=10)
[('resource sharing', 0.9670305848121643), ('broker', 0.9609232544898987), ('highly dynamic', 0.9591811895370483), ('network management', 0.9565896987915039), ('in grid', 0.9546797275543213), ('workload migration', 0.9526695013046265), ('mage', 0.9520704746246338), ('computational grids', 0.9516937732696533), ('decentralised', 0.9509827494621277), ('dependable', 0.9496684074401855)]
# Nearest neighbours of "natural language processing".
w2v.wv.most_similar('natural language processing', topn=10)
[('nlp', 0.9904453158378601), ('lexicon', 0.9455552101135254), ('question answering', 0.9449619650840759), ('multilingual', 0.9350264072418213), ('vocabulary', 0.9333145618438721), ('information extraction', 0.9230526685714722), ('text mining', 0.9228798747062683), ('textual', 0.9207432866096497), ('domain-specific', 0.9194395542144775), ('frame elements', 0.9169366359710693)]
# Nearest neighbours of "performance evaluation".
w2v.wv.most_similar('performance evaluation', topn=10)
[('performance analysis', 0.970462441444397), ('noc', 0.9287059307098389), ('emulation', 0.9172960519790649), ('mpich2', 0.9130640029907227), ('atm', 0.9092670679092407), ('spidergon', 0.9073224067687988), ('network-on-chip', 0.9070577621459961), ('intrusion prevention system', 0.9067220687866211), ('nios', 0.9061682224273682), ('response time', 0.9059284925460815)]
# Nearest neighbours of "data structure".
w2v.wv.most_similar('data structure', topn=10)
[('suffix tree', 0.964256227016449), ('cube', 0.9633463025093079), ('partitions', 0.9605334401130676), ('binary search', 0.9571460485458374), ('trie', 0.9560410976409912), ('suffix array', 0.9552429914474487), ('sorting', 0.9537972807884216), ('deletion', 0.9518201947212219), ('space partition', 0.9506819248199463), ('tree structure', 0.949249804019928)]
# Nearest neighbours of "artificial intelligence".
w2v.wv.most_similar('artificial intelligence', topn=10)
[('ai', 0.9871581196784973), ('expert systems', 0.9188097715377808), ('blended learning', 0.9132139086723328), ('related areas', 0.8982245922088623), ('vuelta', 0.897810161113739), ('article investigates', 0.8968126773834229), ('multidisciplinary research', 0.8900700807571411), ('thinking', 0.8880915641784668), ('interdisciplinary', 0.8866879940032959), ('emo', 0.8836549520492554)]