import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# load the file of 45 variables plus the leftover index column and the RECORD identifier
df = pd.read_csv('NY_45variables.csv')
df.head(5)
Unnamed: 0 | RECORD | r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | ... | r9_taxclass | r1_boro | r2_boro | r3_boro | r4_boro | r5_boro | r6_boro | r7_boro | r8_boro | r9_boro | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 40.917782 | 3994.412456 | 723.985188 | 8.079350 | 788.709805 | 142.953243 | 18.413002 | 1797.485605 | ... | 4.715588 | 0.113902 | 7.712077 | 6.469193 | 0.241395 | 16.606462 | 13.685555 | 0.198288 | 14.042014 | 14.957255 |
1 | 1 | 2 | 57678.882190 | 36173.697854 | 6556.463990 | 4258.951518 | 2671.030012 | 484.122805 | 25955.496986 | 16278.164034 | ... | 42.704717 | 160.560217 | 69.841147 | 58.585497 | 127.249178 | 56.239136 | 46.347248 | 279.513149 | 127.165526 | 135.454020 |
2 | 2 | 9 | 385.182796 | 736.308714 | 14.726174 | 62.110738 | 118.729804 | 2.374596 | 173.332268 | 331.338940 | ... | 0.095917 | 1.072230 | 1.421603 | 0.131586 | 1.855748 | 2.499883 | 0.227331 | 1.866605 | 2.588430 | 0.304237 |
3 | 3 | 10 | 556.662267 | 1064.105879 | 21.282118 | 89.761892 | 171.587267 | 3.431745 | 250.498069 | 478.847739 | ... | 0.138618 | 1.549576 | 2.054487 | 0.190167 | 2.681911 | 3.612808 | 0.328536 | 2.697598 | 3.740773 | 0.439680 |
4 | 4 | 11 | 1035.027624 | 1978.540749 | 39.570815 | 94.470682 | 180.588507 | 3.611770 | 465.762431 | 890.343337 | ... | 0.257740 | 2.881198 | 3.820001 | 0.353586 | 2.822600 | 3.802332 | 0.345771 | 5.015767 | 6.955390 | 0.817518 |
5 rows × 47 columns
df_orig = df.copy()
# drop the leftover index column and the RECORD identifier; they carry no modeling information
df.drop(columns=['Unnamed: 0', 'RECORD'], inplace=True)
df.head()
r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | r9 | r1_zip5 | ... | r9_taxclass | r1_boro | r2_boro | r3_boro | r4_boro | r5_boro | r6_boro | r7_boro | r8_boro | r9_boro | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40.917782 | 3994.412456 | 723.985188 | 8.079350 | 788.709805 | 142.953243 | 18.413002 | 1797.485605 | 325.793334 | 0.096029 | ... | 4.715588 | 0.113902 | 7.712077 | 6.469193 | 0.241395 | 16.606462 | 13.685555 | 0.198288 | 14.042014 | 14.957255 |
1 | 57678.882190 | 36173.697854 | 6556.463990 | 4258.951518 | 2671.030012 | 484.122805 | 25955.496986 | 16278.164034 | 2950.408795 | 135.364878 | ... | 42.704717 | 160.560217 | 69.841147 | 58.585497 | 127.249178 | 56.239136 | 46.347248 | 279.513149 | 127.165526 | 135.454020 |
2 | 385.182796 | 736.308714 | 14.726174 | 62.110738 | 118.729804 | 2.374596 | 173.332268 | 331.338940 | 6.626779 | 0.903974 | ... | 0.095917 | 1.072230 | 1.421603 | 0.131586 | 1.855748 | 2.499883 | 0.227331 | 1.866605 | 2.588430 | 0.304237 |
3 | 556.662267 | 1064.105879 | 21.282118 | 89.761892 | 171.587267 | 3.431745 | 250.498069 | 478.847739 | 9.576955 | 1.306414 | ... | 0.138618 | 1.549576 | 2.054487 | 0.190167 | 2.681911 | 3.612808 | 0.328536 | 2.697598 | 3.740773 | 0.439680 |
4 | 1035.027624 | 1978.540749 | 39.570815 | 94.470682 | 180.588507 | 3.611770 | 465.762431 | 890.343337 | 17.806867 | 2.429076 | ... | 0.257740 | 2.881198 | 3.820001 | 0.353586 | 2.822600 | 3.802332 | 0.345771 | 5.015767 | 6.955390 | 0.817518 |
5 rows × 45 columns
# z-scale every variable: subtract the column mean and divide by the sample standard deviation
df_z = (df - df.mean())/df.std()
df_z.head()
r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | r9 | r1_zip5 | ... | r9_taxclass | r1_boro | r2_boro | r3_boro | r4_boro | r5_boro | r6_boro | r7_boro | r8_boro | r9_boro | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.324189 | 3.365634 | 0.962071 | -0.034088 | 2.023461 | 0.745320 | -0.041301 | 1.874086 | 0.937911 | -0.417857 | ... | 0.660210 | -0.374523 | 3.352458 | 2.151291 | -0.120245 | 0.910164 | 0.790755 | -0.110174 | 0.594662 | 0.887478 |
1 | 107.903135 | 34.685411 | 12.654552 | 60.713604 | 6.996480 | 2.658471 | 127.179921 | 17.407961 | 8.963675 | 62.109671 | ... | 7.410371 | 67.440601 | 34.383853 | 22.651090 | 20.011593 | 3.221530 | 2.826725 | 38.274168 | 5.752631 | 8.549314 |
2 | 0.322241 | 0.194554 | -0.459794 | 0.738056 | 0.253399 | -0.042992 | 0.718576 | 0.301305 | -0.038062 | -0.044388 | ... | -0.160643 | 0.030529 | 0.210577 | -0.341588 | 0.135643 | 0.087473 | -0.048164 | 0.119092 | 0.072426 | -0.044240 |
3 | 0.644229 | 0.513596 | -0.446651 | 1.133208 | 0.393046 | -0.037064 | 1.097074 | 0.459542 | -0.029041 | 0.141639 | ... | -0.153056 | 0.232287 | 0.526681 | -0.318545 | 0.266597 | 0.152378 | -0.041856 | 0.233289 | 0.124968 | -0.035628 |
4 | 1.542460 | 1.403606 | -0.409988 | 1.200500 | 0.416827 | -0.036055 | 2.152944 | 0.900967 | -0.003875 | 0.660585 | ... | -0.131890 | 0.795117 | 1.408496 | -0.254265 | 0.288898 | 0.163431 | -0.040781 | 0.551860 | 0.271541 | -0.011603 |
5 rows × 45 columns
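The z-scaling above could equivalently be done with the StandardScaler imported earlier; a minimal sketch, noting that StandardScaler divides by the population standard deviation (ddof=0) while df.std() uses the sample version (ddof=1), a difference that is negligible at roughly a million rows:
# equivalent standardization with sklearn (a sketch; values differ from df_z only
# by a factor of sqrt(n/(n-1)) because of the ddof difference noted above)
scaler = StandardScaler()
df_z_skl = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)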
# fit a full-rank PCA on the standardized variables to see how the explained variance accumulates
pca = PCA(n_components=45, svd_solver="full")
pca.fit(df_z)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.yticks(np.arange(0.05, 1.1, step=.1))
plt.xticks(np.arange(0, 45, step=2))
plt.axvline(x=6, linestyle='--')
plt.ylim(0,1)
(0.0, 1.0)
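The dashed line at 6 components is read off the curve by eye; the same kind of cutoff could be chosen programmatically for a target share of variance. A sketch, where the 0.80 threshold is an assumed example value rather than a figure from the original analysis:
# smallest number of components whose cumulative explained variance reaches the threshold
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum_var >= 0.80) + 1)   # 0.80 is an assumed example threshold
print(n_keep, cum_var[n_keep - 1])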
# keep the first 6 components and project the standardized data onto them
pca = PCA(n_components=6)
princ_comps = pca.fit_transform(df_z)
df_pca = pd.DataFrame(princ_comps, columns=['PC' + str(i) for i in range(1, pca.n_components_ + 1)])
df_pca.shape
(1046826, 6)
df_pca.head()
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | |
---|---|---|---|---|---|---|
0 | 7.845757 | -1.688264 | -5.706282 | -0.611724 | -2.350794 | -1.089964 |
1 | 118.463348 | 199.428585 | -55.189096 | -27.909091 | 46.844450 | -39.155704 |
2 | 0.198560 | 0.714617 | 0.347698 | -0.774629 | -0.103544 | -0.195167 |
3 | 0.798499 | 1.442086 | 0.078030 | -1.178903 | -0.153971 | -0.351040 |
4 | 2.055261 | 2.959515 | -0.913494 | -2.284578 | -0.407321 | -0.912778 |
# z-scale the retained PCs so each one contributes equally to the distance score below
pca_mean = df_pca.mean()
pca_sd = df_pca.std()
df_z_pca = (df_pca - pca_mean) / pca_sd
df_z_pca.describe()
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | |
---|---|---|---|---|---|---|
count | 1.046826e+06 | 1.046826e+06 | 1.046826e+06 | 1.046826e+06 | 1.046826e+06 | 1.046826e+06 |
mean | -5.461432e-16 | -8.510554e-16 | -2.538016e-16 | -1.073797e-15 | 7.200277e-16 | -5.499035e-16 |
std | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
min | -2.585169e-01 | -2.280285e+02 | -1.990060e+02 | -2.023889e+02 | -2.438705e+02 | -2.002469e+02 |
25% | -1.088008e-01 | -1.544611e-01 | -1.356178e-01 | -3.269576e-02 | -1.159063e-01 | -1.101772e-01 |
50% | -2.375699e-02 | -5.919442e-02 | 6.965796e-04 | 1.366916e-02 | -1.313772e-02 | -2.535145e-02 |
75% | 3.290240e-02 | 3.225958e-02 | 3.206016e-01 | 7.018504e-02 | 4.879025e-02 | 1.382222e-01 |
max | 6.907817e+02 | 3.239819e+02 | 5.967327e+02 | 2.507926e+02 | 4.068905e+02 | 4.017033e+02 |
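Each raw principal component has a very different spread (its sample variance equals the corresponding entry of pca.explained_variance_), so re-scaling to unit variance gives every retained PC equal weight in the distance score computed below. A quick check, assuming the objects above are still in memory:
# each raw PC's sample variance should match PCA's reported eigenvalue
print(df_pca.var().values)        # ddof=1 sample variances of the 6 PCs
print(pca.explained_variance_)    # should agree up to floating-point noise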
df_z_pca
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | |
---|---|---|---|---|---|---|
0 | 1.765189 | -0.500164 | -2.512790 | -0.387135 | -1.937130 | -0.973197 |
1 | 26.652653 | 59.082533 | -24.302792 | -17.662550 | 38.601334 | -34.960956 |
2 | 0.044673 | 0.211712 | 0.153110 | -0.490232 | -0.085324 | -0.174258 |
3 | 0.179652 | 0.427231 | 0.034361 | -0.746081 | -0.126877 | -0.313433 |
4 | 0.462406 | 0.876783 | -0.402262 | -1.445818 | -0.335646 | -0.814992 |
... | ... | ... | ... | ... | ... | ... |
1046821 | -0.051239 | -0.188997 | -0.020978 | -0.044256 | -0.222523 | -0.017497 |
1046822 | -0.066901 | -0.217381 | -0.119167 | -0.036418 | -0.335525 | -0.080359 |
1046823 | -0.022962 | -0.234285 | -0.125895 | 0.018812 | -0.288129 | -0.006414 |
1046824 | 0.034728 | -0.174836 | -0.296620 | -0.006599 | -0.344672 | -0.114188 |
1046825 | 0.009308 | -0.114933 | -0.155469 | -0.002338 | -0.179969 | -0.081994 |
1046826 rows × 6 columns
# score 1: Minkowski (L^p) norm of each record's standardized PC vector; p = 2 gives Euclidean distance
p = 2
one_over_p = 1/p
score1 = ((df_z_pca.abs()**p).sum(axis=1))**one_over_p
score1[:5]
0     3.811775
1    88.408328
2     0.590113
3     0.941783
4     2.002909
dtype: float64
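This score is each record's distance from the origin of the standardized PC space. The same computation as a small reusable helper (the name minkowski_score is illustrative, not from the original notebook):
def minkowski_score(z, p=2):
    # length of each row vector under the L^p norm; p=2 gives Euclidean distance
    return (z.abs() ** p).sum(axis=1) ** (1 / p)

# score1 above is equivalent to minkowski_score(df_z_pca, p=2)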
from keras.models import Model
from keras.layers import Input, Dense
# placeholder for input
data_in = Input(shape=(6,))
# dimension of the encoded data. Choose 3
encoded_dim = 3
# encoded representation of the data
encoded = Dense(encoded_dim, activation='tanh')(data_in)
# reconstruction of the input
decoded = Dense(6, activation='tanh')(encoded)
# this model maps the input data to the reconstructed data
autoencoder = Model(data_in, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(df_z_pca, df_z_pca, epochs=3, shuffle=True, validation_data=(df_z_pca, df_z_pca))
# score 2: Minkowski norm of the autoencoder's reconstruction error for each record
pca_out = autoencoder.predict(df_z_pca)
error = pca_out - df_z_pca
p = 2
oop = 1/p
score2 = ((error.abs()**p).sum(axis=1))**oop
Epoch 1/3 32714/32714 [==============================] - 52s 2ms/step - loss: 1.1203 - val_loss: 0.8963
Epoch 2/3 32714/32714 [==============================] - 46s 1ms/step - loss: 0.7698 - val_loss: 0.8953
Epoch 3/3 32714/32714 [==============================] - 44s 1ms/step - loss: 0.7408 - val_loss: 0.8955
score2[:5]
0     2.175544
1    86.139650
2     0.484693
3     0.694051
4     0.985500
dtype: float64
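score2 applies the same norm to the autoencoder's reconstruction error, so records that the 3-unit bottleneck cannot reproduce well receive large scores. For p = 2 this could equivalently be written with NumPy; a sketch assuming pca_out and df_z_pca are still in memory:
# Euclidean reconstruction error per record, equivalent to score2 with p=2
recon_error = np.linalg.norm(pca_out - df_z_pca.values, axis=1)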
# combine the two scores on a common scale by ranking each one and averaging the ranks
scores = pd.DataFrame({"record": df_orig.RECORD,
                       "score1": score1,
                       "score2": score2})
scores['score1 rank'] = scores['score1'].rank()
scores['score2 rank'] = scores['score2'].rank()
scores['final'] = (scores['score1 rank'] + scores['score2 rank'])/2
scores.head()
record | score1 | score2 | score1 rank | score2 rank | final | |
---|---|---|---|---|---|---|
0 | 1 | 3.811775 | 2.175544 | 1032992.0 | 1032143.0 | 1032567.5 |
1 | 2 | 88.408328 | 86.139650 | 1046749.0 | 1046748.0 | 1046748.5 |
2 | 9 | 0.590113 | 0.484693 | 894888.0 | 980909.0 | 937898.5 |
3 | 10 | 0.941783 | 0.694051 | 978676.0 | 1002288.0 | 990482.0 |
4 | 11 | 2.002909 | 0.985500 | 1019633.0 | 1016703.0 | 1018168.0 |
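Ranking each score before averaging puts the two methods on a common, scale-free footing, so neither dominates simply because its raw values are larger. If one score were trusted more than the other, a weighted combination could be used instead; a hypothetical sketch with assumed weights that are not part of the original analysis:
# hypothetical weighted combination of the two rank scores
w1, w2 = 0.7, 0.3   # assumed weights favoring the PCA/Minkowski score
scores['final_weighted'] = w1 * scores['score1 rank'] + w2 * scores['score2 rank']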
# bring the scores back onto the original property file
origin_data = pd.read_csv('~/Desktop/MGTA463/NY property data.csv')
data_with_score = origin_data.merge(scores, left_on="RECORD", right_on="record")
data_with_score.shape
(1046826, 38)
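The merged frame keeps all 1,046,826 rows, suggesting every scored record matched exactly one property row. A quick sanity check, assuming RECORD is a unique key as the matching row counts imply:
# assumed sanity check: the merge neither dropped nor duplicated any records
assert data_with_score['RECORD'].is_unique
assert len(data_with_score) == len(scores)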
# 100 highest-scoring records, with the helper columns dropped
top_records = (data_with_score.sort_values(['final'], ascending=False)
                              .reset_index(drop=True)
                              .head(100)
                              .drop(columns=["record", "score1", "score2"]))
top_records.head()
RECORD | BBLE | B | BLOCK | LOT | EASEMENT | OWNER | BLDGCL | TAXCLASS | LTFRONT | ... | AVTOT2 | EXLAND2 | EXTOT2 | EXCD2 | PERIOD | YEAR | VALTYPE | score1 rank | score2 rank | final | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 917942 | 4142600001 | 4 | 14260 | 1 | NaN | LOGAN PROPERTY, INC. | T1 | 4 | 4910 | ... | 4.501180e+09 | 1.644454e+09 | 4.501180e+09 | NaN | FINAL | 2010/11 | AC-TR | 1046826.0 | 1046826.0 | 1046826.0 |
1 | 684704 | 4036590105 | 4 | 3659 | 105 | NaN | W RUFERT | V0 | 1B | 2 | ... | NaN | NaN | NaN | NaN | FINAL | 2010/11 | AC-TR | 1046825.0 | 1046825.0 | 1046825.0 |
2 | 1065870 | 5076440001 | 5 | 7644 | 1 | NaN | PEOPLE OF THE ST OF N | V0 | 1B | 2891 | ... | NaN | NaN | NaN | NaN | FINAL | 2010/11 | AC-TR | 1046824.0 | 1046824.0 | 1046824.0 |
3 | 1059883 | 5069770012E | 5 | 6977 | 12 | E | NaN | Z7 | 4 | 5 | ... | NaN | NaN | NaN | NaN | FINAL | 2010/11 | AC-TR | 1046823.0 | 1046823.0 | 1046823.0 |
4 | 151044 | 2024930001 | 2 | 2493 | 1 | NaN | NaN | Q6 | 4 | 798 | ... | NaN | NaN | NaN | NaN | FINAL | 2010/11 | AC-TR | 1046822.0 | 1046822.0 | 1046822.0 |
5 rows × 35 columns
top_records.to_csv("top_records_100.csv", index=False)