The curse of dimensionality refers to a set of phenomena that emerge when working with data in high-dimensional spaces. One of these phenomena arises during learning, specifically during feature selection. Feature selection is the process of selecting a subset of features from a data set. Intuitively, accuracy should improve as we add more features. However, with a fixed sample size, the accuracy eventually stops improving as more features are added, and in some cases it even decreases.
For example, with a fixed sample size, suppose we are trying to identify a person from a picture. We can start off with two features, such as eye color and hair color, and, as our intuition suggests, accuracy improves as we add more features (distance between the eyes, skin color, etc.). However, as we include more and more features without increasing the sample size, the data becomes increasingly sparse in the feature space, and the accuracy eventually stops improving and may even degrade.
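Before running the full text-classification experiment below, the peaking effect can be sketched on synthetic data: hold the sample size and the number of informative features fixed, and pad the data with noise features. The snippet below is a minimal sketch of that idea (the sample size, the 5 informative features, and the k-NN classifier are arbitrary illustrative choices, not part of the experiment that follows); as the number of noise features grows, test accuracy typically stops improving and then degrades.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for n_features in [5, 10, 50, 100, 500, 1000]:
    # Fixed sample size; only 5 features carry signal, the rest are noise
    X, y = make_classification(n_samples=200, n_features=n_features,
                               n_informative=5, n_redundant=0, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = KNeighborsClassifier(3)
    clf.fit(X_train, y_train)
    print(n_features, accuracy_score(y_test, clf.predict(X_test)))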
import plotly
plotly.tools.set_credentials_file(username='usc_eric_vader', api_key='ZH0urCkUHtrTAPURFP32')
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import math
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
"Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1),
AdaBoostClassifier()]
import os
import pickle
HISTORY_FILENAME_TEMPLATE = "accuracy_{}.pkl"
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
import multiprocessing as mp
import sys
def experiment(i, twenty, vectors, clf):
    # Select the i best features by chi-squared score, train clf on the
    # reduced training set, and return its accuracy on the test set.
    twenty_train, twenty_test = twenty
    vectors_train, vectors_test = vectors
    fs = SelectKBest(chi2, k=i)
    vectors_fs = fs.fit_transform(vectors_train, twenty_train.target)
    vectors_test_fs = fs.transform(vectors_test)
    clf.fit(vectors_fs, twenty_train.target)
    predict = clf.predict(vectors_test_fs)
    return accuracy_score(twenty_test.target, predict)
history = {}
# In this block, we will start all of the jobs
with mp.Pool(mp.cpu_count()) as pool:
    # Fetch training and test sets
    twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      shuffle=True, random_state=42)
    twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                     shuffle=True, random_state=42)
    twenty = twenty_train, twenty_test
    # Use tf-idf features
    vectorizer = TfidfVectorizer()
    vectors_train = vectorizer.fit_transform(twenty_train.data)
    vectors_test = vectorizer.transform(twenty_test.data)
    vectors = vectors_train, vectors_test
    num_feats = vectors_train.shape[1]
    # Feature counts to try: all powers b**i (for bases 2 to 10) that do not exceed num_feats
    target_num_feats = set()
    for b in range(2, 11):
        target_num_feats.update([b**i for i in range(0, int(math.log(num_feats, b)) + 1)])
    target_num_feats = sorted(target_num_feats)
    for name, clf in zip(names, classifiers):
        history_filename = HISTORY_FILENAME_TEMPLATE.format(name)
        # If saved results exist, load them and skip recomputation
        if os.path.isfile(history_filename):
            with open(history_filename, "rb") as f:
                history[name] = pickle.load(f)
            continue
        else:
            print("Processing {}".format(name))
            # Feature selection with different numbers of selected features,
            # dispatched asynchronously to the worker pool
            history[name] = [pool.apply_async(experiment, args=(i, twenty, vectors, clf))
                             for i in target_num_feats]
    # In this loop, we will collect all of the jobs
    for name, clf in zip(names, classifiers):
        history_filename = HISTORY_FILENAME_TEMPLATE.format(name)
        if os.path.isfile(history_filename):
            continue
        else:
            print("Collecting {}".format(name))
            # Pair the feature counts with the collected accuracies so the
            # in-memory entry matches the format pickled to disk
            history[name] = (target_num_feats, [h.get() for h in history[name]])
            with open(history_filename, "wb") as f:
                pickle.dump(history[name], f)
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
traces = []
for k, v in history.items():
    x, y = v
    traces.append(
        go.Scatter(y=y,
                   x=np.log10(x),
                   mode="lines",
                   name=k,
                   line={'shape': 'spline'})
    )
layout1 = go.Layout(xaxis=dict(title="log(Number of Features)",
                               rangemode="tozero"),
                    yaxis=dict(title="Test Accuracy",
                               rangemode="tozero"))
fig1 = go.Figure(data=traces, layout=layout1)
py.iplot(fig1, filename="Test Accuracy for various classifiers")