import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt') # downloads you a model
nltk.download('stopwords') # <--- this is new
from nltk.corpus import stopwords
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to /home/jovyan/nltk_data... [nltk_data] Package stopwords is already up-to-date!
# AutoPhrase output: tab-separated (quality score, phrase) pairs with no header row.
dataset = pd.read_csv("data/AutoPhrase.txt", sep="\t", header=None)
dataset  # notebook display of the raw frame
0 | 1 | |
---|---|---|
0 | 0.987412 | density estimation |
1 | 0.985778 | inverse kinematics |
2 | 0.985462 | gröbner bases |
3 | 0.984563 | amdahl's law |
4 | 0.983978 | mutual information |
... | ... | ... |
720233 | 0.004385 | this algorithm does not |
720234 | 0.004292 | a programming paradigm for |
720235 | 0.004292 | an adaptive method of |
720236 | 0.004292 | an integrated database for |
720237 | 0.003529 | and more widely used |
720238 rows × 2 columns
# Give the two unnamed columns meaningful labels, mutating the frame in place.
dataset.rename(columns={0: "score", 1: "phrase"}, inplace=True)
# Peek at the highest-quality phrases (file is sorted by score, descending).
dataset.head(30)
score | phrase | |
---|---|---|
0 | 0.987412 | density estimation |
1 | 0.985778 | inverse kinematics |
2 | 0.985462 | gröbner bases |
3 | 0.984563 | amdahl's law |
4 | 0.983978 | mutual information |
5 | 0.983851 | lecture notes |
6 | 0.983412 | naïve bayes |
7 | 0.983329 | texas instruments |
8 | 0.983263 | bundle adjustment |
9 | 0.983129 | computerized tomography |
10 | 0.983021 | chronic diseases |
11 | 0.982538 | motif discovery |
12 | 0.982517 | signaling pathway |
13 | 0.982216 | disaster recovery |
14 | 0.982161 | nuclear magnetic resonance |
15 | 0.982121 | insider threats |
16 | 0.982113 | finitely generated |
17 | 0.981962 | partially observable |
18 | 0.981710 | reinforcement learning |
19 | 0.981573 | biped locomotion |
20 | 0.981407 | combinatory categorial grammar |
21 | 0.981361 | tablet pcs |
22 | 0.981327 | south korea |
23 | 0.981282 | xilinx xc4000 |
24 | 0.981279 | alzheimer's disease |
25 | 0.981279 | river basin |
26 | 0.981236 | universally composable |
27 | 0.981046 | south africa |
28 | 0.981026 | vickrey auctions |
29 | 0.980971 | confocal microscopy |
# Lowest-scoring entries: mostly generic sentence fragments rather than real phrases.
dataset.tail(30)
score | phrase | |
---|---|---|
720208 | 0.005365 | for systems described by |
720209 | 0.005358 | an alternative technique for |
720210 | 0.005358 | an efficient processing of |
720211 | 0.005348 | the technique allows for |
720212 | 0.005348 | a lens for |
720213 | 0.005308 | the modular decomposition of |
720214 | 0.005308 | a significant speedup for |
720215 | 0.005308 | a small area of |
720216 | 0.005308 | a layered approach for |
720217 | 0.005303 | to provide flexibility in |
720218 | 0.005294 | a converter for |
720219 | 0.005292 | a powerful method of |
720220 | 0.005292 | a high degree of confidence in |
720221 | 0.005292 | the equilibrium point for |
720222 | 0.005292 | a classification scheme of |
720223 | 0.005292 | a big problem in |
720224 | 0.005292 | a hybrid solution for |
720225 | 0.005292 | a solution approach for |
720226 | 0.005271 | a fundamental result in |
720227 | 0.005271 | a critical requirement in |
720228 | 0.005271 | the algorithm works with |
720229 | 0.005271 | the main bottleneck of |
720230 | 0.005259 | and more important for |
720231 | 0.005248 | a preparation for |
720232 | 0.004871 | an efficient test for |
720233 | 0.004385 | this algorithm does not |
720234 | 0.004292 | a programming paradigm for |
720235 | 0.004292 | an adaptive method of |
720236 | 0.004292 | an integrated database for |
720237 | 0.003529 | and more widely used |
# Phrase-segmented corpus: one segmented title per line, tab-separated, no header.
df = pd.read_csv("data/segmentation.txt", sep="\t", header=None)
df.columns = ["phrase"]
def process(text):
    """Normalize one segmented phrase string.

    Spaces (the segmenter's phrase separators) become commas, underscores
    (intra-phrase word joins) become spaces, and the result is lowercased,
    yielding a comma-separated list of multi-word phrases.
    """
    # Parameter renamed from `str` so the builtin type is not shadowed.
    return text.replace(' ', ',').replace('_', ' ').lower()
# Normalize every title with `process` and keep the original row index as a column.
df_ = df['phrase'].astype(str).map(process).reset_index()
df_.head(20)
index | phrase | |
---|---|---|
0 | 0 | oql,c++,extending,c++ |
1 | 1 | transaction management,multidatabase systems |
2 | 2 | overview |
3 | 3 | multimedia,information |
4 | 4 | active,database systems |
5 | 5 | object-oriented,dbmss,early |
6 | 6 | distributed,databases |
7 | 7 | an object-oriented,dbms,war,story,developing,g... |
8 | 8 | cooperative,multiuser |
9 | 9 | architecture,multidatabase |
10 | 10 | physical object,management |
11 | 11 | introduction,next-generation,database,technology |
12 | 12 | object-oriented,database systems,reality |
13 | 13 | introduction,technology,interoperating,legacy ... |
14 | 14 | resolving,schematic,multidatabase systems |
15 | 15 | performance benchmark,object-oriented,database... |
16 | 16 | object-oriented,databases |
17 | 17 | solution,managing,e,p,data |
18 | 18 | c++,object database |
19 | 19 | authorization,object-oriented,databases |
def preprocess_df(df):
    """Strip stopwords and single-character tokens from df["phrase"], in place.

    Each cell of df["phrase"] is expected to be a comma-separated string of
    phrases (as produced by `process`); it is replaced by the list of phrases
    that survive filtering. Returns the mutated DataFrame.
    """
    # English stopwords plus one domain-specific addition.
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    # Vectorized replacement for the original iterrows() append-loop:
    # same filter (drop stopwords and length-1 tokens), same row order.
    df["phrase"] = [
        [word for word in sent.strip().split(',')
         if word not in stop_words and len(word) != 1]
        for sent in df["phrase"]
    ]
    return df
# Work on a copy so the original normalized frame `df_` stays intact.
processed_data_ = preprocess_df(df_.copy())
# The original wrapped this in an enumerate whose index was never used;
# a plain list() of the column is the idiomatic equivalent.
tagged_data = list(processed_data_["phrase"])
tagged_data[:10]
[['oql', 'c++', 'extending', 'c++'], ['transaction management', 'multidatabase systems'], ['overview'], ['multimedia', 'information'], ['active', 'database systems'], ['object-oriented', 'dbmss', 'early'], ['distributed', 'databases'], ['an object-oriented', 'dbms', 'war', 'story', 'developing', 'genome', 'mapping', 'database', 'c++'], ['cooperative', 'multiuser'], ['architecture', 'multidatabase']]
from gensim.models import word2vec
# NOTE(review): `corpus` is never used in the visible code below — looks like
# a leftover; confirm it is unused further down before removing.
corpus = df_.phrase
# Train Word2Vec on the phrase lists; min_count=1 keeps every token,
# including tokens that occur only once.
w2v = word2vec.Word2Vec(sentences = tagged_data, min_count=1)
# Sanity check: nearest neighbours of "computer science" in embedding space.
w2v.wv.most_similar('computer science', topn=10)
[('curriculum', 0.966228723526001), ('mathematics', 0.9508180022239685), ('undergraduate', 0.9425908327102661), ('graduate', 0.9304831624031067), ('science students', 0.9252045154571533), ('clemson', 0.9201644062995911), ('education', 0.9200882911682129), ('liberal arts college', 0.9195355176925659), ('se', 0.9188847541809082), ('baccalaureate degree', 0.9187815189361572)]
# Nearest neighbours of "resource management" in the learned embedding space.
w2v.wv.most_similar('resource management', topn=10)
[('resource sharing', 0.9670305848121643), ('broker', 0.9609232544898987), ('highly dynamic', 0.9591811895370483), ('network management', 0.9565896987915039), ('in grid', 0.9546797275543213), ('workload migration', 0.9526695013046265), ('mage', 0.9520704746246338), ('computational grids', 0.9516937732696533), ('decentralised', 0.9509827494621277), ('dependable', 0.9496684074401855)]
# Nearest neighbours of "natural language processing".
w2v.wv.most_similar('natural language processing', topn=10)
[('nlp', 0.9904453158378601), ('lexicon', 0.9455552101135254), ('question answering', 0.9449619650840759), ('multilingual', 0.9350264072418213), ('vocabulary', 0.9333145618438721), ('information extraction', 0.9230526685714722), ('text mining', 0.9228798747062683), ('textual', 0.9207432866096497), ('domain-specific', 0.9194395542144775), ('frame elements', 0.9169366359710693)]
# Nearest neighbours of "performance evaluation".
w2v.wv.most_similar('performance evaluation', topn=10)
[('performance analysis', 0.970462441444397), ('noc', 0.9287059307098389), ('emulation', 0.9172960519790649), ('mpich2', 0.9130640029907227), ('atm', 0.9092670679092407), ('spidergon', 0.9073224067687988), ('network-on-chip', 0.9070577621459961), ('intrusion prevention system', 0.9067220687866211), ('nios', 0.9061682224273682), ('response time', 0.9059284925460815)]
# Nearest neighbours of "data structure".
w2v.wv.most_similar('data structure', topn=10)
[('suffix tree', 0.964256227016449), ('cube', 0.9633463025093079), ('partitions', 0.9605334401130676), ('binary search', 0.9571460485458374), ('trie', 0.9560410976409912), ('suffix array', 0.9552429914474487), ('sorting', 0.9537972807884216), ('deletion', 0.9518201947212219), ('space partition', 0.9506819248199463), ('tree structure', 0.949249804019928)]
# Nearest neighbours of "artificial intelligence".
w2v.wv.most_similar('artificial intelligence', topn=10)
[('ai', 0.9871581196784973), ('expert systems', 0.9188097715377808), ('blended learning', 0.9132139086723328), ('related areas', 0.8982245922088623), ('vuelta', 0.897810161113739), ('article investigates', 0.8968126773834229), ('multidisciplinary research', 0.8900700807571411), ('thinking', 0.8880915641784668), ('interdisciplinary', 0.8866879940032959), ('emo', 0.8836549520492554)]