Main

RESULTS

In [1]:
# 0.823  one-hot + standardization  NGnDeQZt
# 0.829  one-hot + standardization + quantile transform + feature drop
# 0.844  standardization + specialized feature drop
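
The exact pipelines behind these scores are not kept in this notebook; a minimal sketch of the "0.844 standardization + specialized drop" idea, assuming the training_data.csv schema used below (the dropped columns here are only an example, not the actual drop set):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("training_data.csv")
X = df.drop(["label", "key", "time_signature"], axis=1)  # example drop set
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
y = df["label"]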
In [2]:
# 0.78 xgboost
# test = pd.read_csv("songs_to_classify.csv")
# test = test.drop(["key"], axis=1)
# test = test.drop(["time_signature"], axis=1)
# test = test.drop(["duration"], axis=1) # bad: +1.5
# test = test.drop(["mode"], axis=1) # +1
# test = test.drop(["liveness"], axis=1) # -0.2
# test = test.drop(["tempo"], axis=1) # +1
# scale = StandardScaler()
# names = test.columns
# processed_test_data = pd.DataFrame(scale.fit_transform(test), columns=names)

IMPORTS

In [3]:
import pandas as pd
import numpy as np
from matplotlib import style, pyplot as plt
import xgboost


from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC

from sklearn.preprocessing import QuantileTransformer
import seaborn as sns
from itertools import combinations
from mpl_toolkits import mplot3d

PREPROCESSING

In [4]:
test = pd.read_csv("songs_to_classify.csv")
df = pd.read_csv("training_data.csv")

#df["liveness"] = df["liveness"].apply(np.exp)
#df = df.drop(["key"], axis=1)
#df = df.drop(["time_signature"], axis=1)
#df = df.drop(["instrumentalness"], axis=1)
#df = df.drop(["energy"], axis=1) # lite viktig -0.5
#df = df.drop(["duration"], axis=1) # dålig +1.5
#df = df.drop(["mode"], axis=1) # +1
#df = df.drop(["liveness"], axis=1) # -0.2
#df = df.drop(["loudness"], axis=1) # -2.0
#df = df.drop(["acousticness"], axis=1) # -3.3
#df = df.drop(["speechiness"], axis=1) # -7.7
#df = df.drop(["danceability"], axis=1) # -11
#df = df.drop(["tempo"], axis=1) # +1
#df = df.drop(["valence"], axis=1) # -1

scale = StandardScaler()
# names = df.columns
# qt = QuantileTransformer(output_distribution='normal')
# label = df["label"]
# df = pd.DataFrame(qt.fit_transform(df.to_numpy().reshape(-1, df.shape[1])), columns=names)
# df["label"] = label

X = df.drop("label", axis=1)
names = X.columns
X = pd.DataFrame(scale.fit_transform(X), columns=names)

#dummies = pd.get_dummies(X["time_signature"], prefix="signature" ,drop_first=True)
#X = pd.concat([X,dummies],axis=1).drop("time_signature", axis=1)

#dummies = pd.get_dummies(X["key"], prefix="key" ,drop_first=True)
#X = pd.concat([X,dummies],axis=1).drop("key", axis=1)

y = df["label"]
df.head()
Out[4]:
acousticness danceability duration energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence label
0 0.713 0.514 100125 0.521 0.816000 8 0.1120 -14.835 0 0.0444 119.879 4 0.143 1
1 0.192 0.714 207019 0.614 0.000000 4 0.2630 -6.935 1 0.0319 123.969 4 0.582 1
2 0.333 0.630 216200 0.455 0.000004 5 0.1270 -9.290 1 0.0292 139.931 4 0.199 1
3 0.601 0.810 136413 0.221 0.210000 5 0.1840 -11.005 1 0.0429 109.960 4 0.798 1
4 0.883 0.465 181440 0.459 0.000173 6 0.0692 -8.137 0 0.0351 90.807 4 0.288 1
In [5]:
# test = pd.read_csv("songs_to_classify.csv")
# scale = StandardScaler()
# X = test
# names = X.columns
# X = pd.DataFrame(scale.fit_transform(X), columns=names)
# X.head()
# Competition(model, X)
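
The commented-out cell above refits a fresh StandardScaler on the test songs, which leaks test-set statistics into the preprocessing. A safer sketch reuses the scale object already fitted on the training features (assuming the test file has the same feature columns in the same order):

processed_test_data = pd.DataFrame(scale.transform(test), columns=test.columns)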
In [18]:
# Histogram of each standardized training feature
for column in X.columns:
    plt.figure(figsize=(12,8))
    sns.distplot(X[column], kde=False)
    plt.title(column)
    plt.show()

FUNCTIONS

In [7]:
# Final
# Takes the trained model and the processed test data and returns the
# predictions as one digit string, the competition's submission format
def Competition(model, processed_test_data):
    y_pred = model.predict(processed_test_data)
    # join element-wise; str(array) would truncate long arrays with "..."
    return "".join(str(label) for label in y_pred)
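
A usage sketch, assuming processed_test_data was produced with the training-fitted scaler as above:

fitted = LR(penalty="l2", solver="lbfgs").fit(X, y)
print(Competition(fitted, processed_test_data))  # e.g. "0110...1"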
In [8]:
# Stratified k-fold cross-validation
# Lower bias, higher variance than the Monte Carlo split below
def CV_score(model, X, y, folds=5, xgb=False):
    skf = StratifiedKFold(n_splits=folds, random_state=1, shuffle=True)
    result = 0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        if xgb:
            # let XGBoost stop adding trees once the held-out fold stops improving
            model.fit(X_train, y_train, early_stopping_rounds=5,
                      eval_set=[(X_test, y_test)], verbose=False)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        result += np.sum(y_test == y_pred) / y_test.shape[0]
    return result / folds
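
For the non-XGBoost path this loop should agree with sklearn's built-in scorer (accuracy is the default metric for classifiers); a quick cross-check sketch:

from sklearn.model_selection import cross_val_score
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
print(cross_val_score(LR(penalty="l2", solver="lbfgs"), X, y, cv=skf).mean())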
    
In [9]:
# Monte Carlo (repeated random split) validation
# Higher bias, lower variance than the stratified k-fold above
def monte_carlo(model, X, y, folds=5, xgb=False):
    monte = ShuffleSplit(n_splits=folds, test_size=.25, random_state=0)
    result = 0
    for train_index, test_index in monte.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        if xgb:
            model.fit(X_train, y_train, early_stopping_rounds=5,
                      eval_set=[(X_test, y_test)], verbose=False)
        else:
            model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        result += np.sum(y_test == y_pred) / y_test.shape[0]
    return result / folds

MODELS

In [10]:
# Logistic Regression
model = LR(penalty="l2", solver="lbfgs")
CV_score(model, X, y)
Out[10]:
0.8120463428004208
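
Because the inputs are standardized, the magnitudes of the logistic regression coefficients are roughly comparable across features; a sketch for eyeballing them against the drop experiments in the preprocessing cell:

model.fit(X, y)
print(pd.Series(model.coef_[0], index=X.columns).sort_values())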
In [11]:
model = SVC(gamma='auto')  # the old default, made explicit to silence the sklearn FutureWarning
CV_score(model, X, y)
Out[11]:
0.8012727676785636
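
The SVC above runs with default C; a small grid-search sketch over C and gamma (grid values chosen arbitrarily):

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(), {"C": [0.1, 1, 10], "gamma": ["auto", "scale", 0.1]}, cv=5)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)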
In [12]:
# RandomForest
model = RF(
    n_estimators=10,  # the old sklearn default, made explicit to silence the FutureWarning
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=1,
    verbose=0,
    warm_start=False,
    class_weight=None,
)
CV_score(model, X, y, folds=5)
Out[12]:
0.8266338948397707
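
The per-feature drop bookkeeping in the preprocessing cell can be cross-checked against the forest's own impurity-based importances; a quick sketch:

model.fit(X, y)
print(pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False))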
In [13]:
# XGboost
model = xgboost.XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=10,
    objective='binary:logistic',
    booster='gbtree',
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=14
)

print(CV_score(model, X, y, folds=6, xgb=True))
#monte_carlo(model, X, y, folds=6)
0.8185951527564431
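
With early stopping active in CV_score, n_estimators only caps the number of boosting rounds, so a larger cap lets each held-out fold decide when to stop; a sketch (cap chosen arbitrarily):

model = xgboost.XGBClassifier(max_depth=3, learning_rate=0.1,
                              n_estimators=500, random_state=14)
print(CV_score(model, X, y, folds=6, xgb=True))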

DATA VISUALISATIONS

In [14]:
names = ['acousticness', 'danceability', 'duration', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']

y = pd.read_csv("training_data.csv")["label"]
contdf = pd.read_csv("training_data.csv").drop("label", axis=1)

def contour_2D(model,contdf, y, columns=("danceability","speechiness")):
    x_axis, y_axis = columns
    X = pd.concat([contdf[x_axis],contdf[y_axis]], axis=1)
    X_np, y_np = X.to_numpy(), y.to_numpy()
    model.fit(X_np, y_np)

    axes = [X.min()[0]-0.1, X.max()[0]+0.1, X.min()[1]-0.1, X.max()[1]+0.1]

    x0, x1 = np.meshgrid(
            np.linspace(axes[0], axes[1], 10).reshape(-1, 1),
            np.linspace(axes[2], axes[3], 10).reshape(-1, 1),
        )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict_np = model.predict(X_new)
    #y_predict = pd.DataFrame(y_predict_np)
    zz = y_predict_np.reshape(x0.shape)

    plt.figure(figsize=(8, 8))
    plt.plot(X_np[y==0, 0], X_np[y==0, 1], "bs", label="dislike")
    plt.plot(X_np[y==1, 0], X_np[y==1, 1], "yo", label="like")

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#9898ff', '#fafab0'])

    plt.contourf(x0, x1, zz, cmap=custom_cmap)  # apply the colormap defined above
    plt.xlabel(x_axis, fontsize=14)
    plt.ylabel(y_axis, fontsize=14)
    plt.title("CV accuracy: " + str(CV_score(model, X, y, folds=5)))
    plt.legend(loc="lower right", fontsize=14)
    plt.axis(axes)
In [15]:
import warnings
warnings.filterwarnings("ignore")
for i,a in enumerate(combinations(names, 2)):
    if (i<2):
        contour_2D(model,contdf, y, columns=a)
In [16]:
names = ['acousticness', 'danceability', 'duration', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']

y = pd.read_csv("training_data.csv")["label"]
contdf = pd.read_csv("training_data.csv").drop("label", axis=1)

def scatter_3D(model, contdf, y, columns=("danceability", "speechiness", "energy")):
    # Create the figure before the 3D axes so figsize applies to the plot;
    # creating the axes first leaves the points on a stray empty figure
    plt.figure(figsize=(10, 4))
    ax = plt.axes(projection="3d")
    x_axis, y_axis, z_axis = columns
    X = pd.concat([contdf[x_axis], contdf[y_axis], contdf[z_axis]], axis=1).to_numpy()

    ax.plot3D(X[y==0, 0], X[y==0, 1], X[y==0, 2], "bs", label="dislike")
    ax.plot3D(X[y==1, 0], X[y==1, 1], X[y==1, 2], "yo", label="like")
    ax.set_xlabel(x_axis)
    ax.set_ylabel(y_axis)
    ax.set_zlabel(z_axis)
    ax.legend()
In [17]:
#import warnings
#warnings.filterwarnings("ignore")
for i,a in enumerate(combinations(names, 3)):
    if (i<2):
        scatter_3D(model,contdf, y, columns=a)