import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')  # download the Punkt tokenizer models
nltk.download('stopwords')  # download the English stopword lists
from nltk.corpus import stopwords
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to /home/jovyan/nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Load the AutoPhrase output: one tab-separated (score, phrase) pair per line.
dataset = pd.read_csv("data/AutoPhrase.txt", sep="\t", header=None)
dataset
| 0 | 1 | |
|---|---|---|
| 0 | 0.987412 | density estimation |
| 1 | 0.985778 | inverse kinematics |
| 2 | 0.985462 | gröbner bases |
| 3 | 0.984563 | amdahl's law |
| 4 | 0.983978 | mutual information |
| ... | ... | ... |
| 720233 | 0.004385 | this algorithm does not |
| 720234 | 0.004292 | a programming paradigm for |
| 720235 | 0.004292 | an adaptive method of |
| 720236 | 0.004292 | an integrated database for |
| 720237 | 0.003529 | and more widely used |
720238 rows × 2 columns
# Give the two anonymous columns meaningful names: the phrase-quality
# score and the mined phrase text.
dataset.rename(columns={0: "score", 1: "phrase"}, inplace=True)
dataset.head(30)
| score | phrase | |
|---|---|---|
| 0 | 0.987412 | density estimation |
| 1 | 0.985778 | inverse kinematics |
| 2 | 0.985462 | gröbner bases |
| 3 | 0.984563 | amdahl's law |
| 4 | 0.983978 | mutual information |
| 5 | 0.983851 | lecture notes |
| 6 | 0.983412 | naïve bayes |
| 7 | 0.983329 | texas instruments |
| 8 | 0.983263 | bundle adjustment |
| 9 | 0.983129 | computerized tomography |
| 10 | 0.983021 | chronic diseases |
| 11 | 0.982538 | motif discovery |
| 12 | 0.982517 | signaling pathway |
| 13 | 0.982216 | disaster recovery |
| 14 | 0.982161 | nuclear magnetic resonance |
| 15 | 0.982121 | insider threats |
| 16 | 0.982113 | finitely generated |
| 17 | 0.981962 | partially observable |
| 18 | 0.981710 | reinforcement learning |
| 19 | 0.981573 | biped locomotion |
| 20 | 0.981407 | combinatory categorial grammar |
| 21 | 0.981361 | tablet pcs |
| 22 | 0.981327 | south korea |
| 23 | 0.981282 | xilinx xc4000 |
| 24 | 0.981279 | alzheimer's disease |
| 25 | 0.981279 | river basin |
| 26 | 0.981236 | universally composable |
| 27 | 0.981046 | south africa |
| 28 | 0.981026 | vickrey auctions |
| 29 | 0.980971 | confocal microscopy |
# Inspect the lowest-scoring entries: mostly sentence fragments, not real phrases.
dataset.tail(30)
| score | phrase | |
|---|---|---|
| 720208 | 0.005365 | for systems described by |
| 720209 | 0.005358 | an alternative technique for |
| 720210 | 0.005358 | an efficient processing of |
| 720211 | 0.005348 | the technique allows for |
| 720212 | 0.005348 | a lens for |
| 720213 | 0.005308 | the modular decomposition of |
| 720214 | 0.005308 | a significant speedup for |
| 720215 | 0.005308 | a small area of |
| 720216 | 0.005308 | a layered approach for |
| 720217 | 0.005303 | to provide flexibility in |
| 720218 | 0.005294 | a converter for |
| 720219 | 0.005292 | a powerful method of |
| 720220 | 0.005292 | a high degree of confidence in |
| 720221 | 0.005292 | the equilibrium point for |
| 720222 | 0.005292 | a classification scheme of |
| 720223 | 0.005292 | a big problem in |
| 720224 | 0.005292 | a hybrid solution for |
| 720225 | 0.005292 | a solution approach for |
| 720226 | 0.005271 | a fundamental result in |
| 720227 | 0.005271 | a critical requirement in |
| 720228 | 0.005271 | the algorithm works with |
| 720229 | 0.005271 | the main bottleneck of |
| 720230 | 0.005259 | and more important for |
| 720231 | 0.005248 | a preparation for |
| 720232 | 0.004871 | an efficient test for |
| 720233 | 0.004385 | this algorithm does not |
| 720234 | 0.004292 | a programming paradigm for |
| 720235 | 0.004292 | an adaptive method of |
| 720236 | 0.004292 | an integrated database for |
| 720237 | 0.003529 | and more widely used |
# Load the phrase-segmented corpus; each line is one segmented document title.
df = pd.read_csv("data/segmentation.txt", sep="\t", header=None)
df.rename(columns={0: "phrase"}, inplace=True)
def process(text):
    """Normalize one segmented line.

    Spaces (phrase separators in the segmentation output) become commas,
    underscores (intra-phrase joiners) become spaces, and the result is
    lowercased.  The parameter was renamed from ``str`` to avoid
    shadowing the builtin.
    """
    return text.replace(' ', ',').replace('_', ' ').lower()
# Normalize every row and keep the original row number as an "index" column.
df_ = df["phrase"].astype(str).map(process).reset_index()
df_.head(20)
| index | phrase | |
|---|---|---|
| 0 | 0 | oql,c++,extending,c++ |
| 1 | 1 | transaction management,multidatabase systems |
| 2 | 2 | overview |
| 3 | 3 | multimedia,information |
| 4 | 4 | active,database systems |
| 5 | 5 | object-oriented,dbmss,early |
| 6 | 6 | distributed,databases |
| 7 | 7 | an object-oriented,dbms,war,story,developing,g... |
| 8 | 8 | cooperative,multiuser |
| 9 | 9 | architecture,multidatabase |
| 10 | 10 | physical object,management |
| 11 | 11 | introduction,next-generation,database,technology |
| 12 | 12 | object-oriented,database systems,reality |
| 13 | 13 | introduction,technology,interoperating,legacy ... |
| 14 | 14 | resolving,schematic,multidatabase systems |
| 15 | 15 | performance benchmark,object-oriented,database... |
| 16 | 16 | object-oriented,databases |
| 17 | 17 | solution,managing,e,p,data |
| 18 | 18 | c++,object database |
| 19 | 19 | authorization,object-oriented,databases |
def preprocess_df(df):
    """Tokenize each comma-separated phrase string, dropping noise tokens.

    Splits every row of the "phrase" column on commas and removes English
    stopwords (plus "would") and tokens shorter than two characters —
    single stray characters and the empty strings produced by consecutive
    commas carry no signal.  Mutates *df* in place and returns it, with
    "phrase" holding a list of surviving tokens per row.
    """
    # Build the stopword set once, outside the per-row work.
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')  # domain-specific extra stopword

    def _tokenize(sent):
        # len(w) > 1 also filters the '' tokens that str.split yields
        # for back-to-back commas (the old != 1 test let them through).
        return [w for w in sent.strip().split(',')
                if w not in stop_words and len(w) > 1]

    # Column-wise apply replaces the much slower iterrows() loop.
    df["phrase"] = df["phrase"].apply(_tokenize)
    return df
# Run the stopword/short-token filter on a copy so df_ stays untouched.
processed_data_ = preprocess_df(df_.copy())
# Materialize the token lists as a plain list of sentences for Word2Vec
# (a direct list() call — the index from enumerate was never used).
tagged_data = list(processed_data_["phrase"])
tagged_data[:10]
[['oql', 'c++', 'extending', 'c++'], ['transaction management', 'multidatabase systems'], ['overview'], ['multimedia', 'information'], ['active', 'database systems'], ['object-oriented', 'dbmss', 'early'], ['distributed', 'databases'], ['an object-oriented', 'dbms', 'war', 'story', 'developing', 'genome', 'mapping', 'database', 'c++'], ['cooperative', 'multiuser'], ['architecture', 'multidatabase']]
from gensim.models import word2vec
# NOTE(review): `corpus` is assigned but never used anywhere below — confirm
# before removing; a later (unseen) cell might reference it.
corpus = df_.phrase
# Train Word2Vec on the token lists; min_count=1 keeps even rare phrases
# in the vocabulary so every phrase gets a vector.
w2v = word2vec.Word2Vec(sentences = tagged_data, min_count=1)
# Sanity-check the embedding space: nearest neighbours of a known phrase.
w2v.wv.most_similar('computer science', topn=10)
[('curriculum', 0.966228723526001),
('mathematics', 0.9508180022239685),
('undergraduate', 0.9425908327102661),
('graduate', 0.9304831624031067),
('science students', 0.9252045154571533),
('clemson', 0.9201644062995911),
('education', 0.9200882911682129),
('liberal arts college', 0.9195355176925659),
('se', 0.9188847541809082),
('baccalaureate degree', 0.9187815189361572)]
# Nearest neighbours of "resource management" in the trained embedding space.
w2v.wv.most_similar('resource management', topn=10)
[('resource sharing', 0.9670305848121643),
('broker', 0.9609232544898987),
('highly dynamic', 0.9591811895370483),
('network management', 0.9565896987915039),
('in grid', 0.9546797275543213),
('workload migration', 0.9526695013046265),
('mage', 0.9520704746246338),
('computational grids', 0.9516937732696533),
('decentralised', 0.9509827494621277),
('dependable', 0.9496684074401855)]
# Nearest neighbours of "natural language processing" in the trained embedding space.
w2v.wv.most_similar('natural language processing', topn=10)
[('nlp', 0.9904453158378601),
('lexicon', 0.9455552101135254),
('question answering', 0.9449619650840759),
('multilingual', 0.9350264072418213),
('vocabulary', 0.9333145618438721),
('information extraction', 0.9230526685714722),
('text mining', 0.9228798747062683),
('textual', 0.9207432866096497),
('domain-specific', 0.9194395542144775),
('frame elements', 0.9169366359710693)]
# Nearest neighbours of "performance evaluation" in the trained embedding space.
w2v.wv.most_similar('performance evaluation', topn=10)
[('performance analysis', 0.970462441444397),
('noc', 0.9287059307098389),
('emulation', 0.9172960519790649),
('mpich2', 0.9130640029907227),
('atm', 0.9092670679092407),
('spidergon', 0.9073224067687988),
('network-on-chip', 0.9070577621459961),
('intrusion prevention system', 0.9067220687866211),
('nios', 0.9061682224273682),
('response time', 0.9059284925460815)]
# Nearest neighbours of "data structure" in the trained embedding space.
w2v.wv.most_similar('data structure', topn=10)
[('suffix tree', 0.964256227016449),
('cube', 0.9633463025093079),
('partitions', 0.9605334401130676),
('binary search', 0.9571460485458374),
('trie', 0.9560410976409912),
('suffix array', 0.9552429914474487),
('sorting', 0.9537972807884216),
('deletion', 0.9518201947212219),
('space partition', 0.9506819248199463),
('tree structure', 0.949249804019928)]
# Nearest neighbours of "artificial intelligence" in the trained embedding space.
w2v.wv.most_similar('artificial intelligence', topn=10)
[('ai', 0.9871581196784973),
('expert systems', 0.9188097715377808),
('blended learning', 0.9132139086723328),
('related areas', 0.8982245922088623),
('vuelta', 0.897810161113739),
('article investigates', 0.8968126773834229),
('multidisciplinary research', 0.8900700807571411),
('thinking', 0.8880915641784668),
('interdisciplinary', 0.8866879940032959),
('emo', 0.8836549520492554)]