The curse of dimensionality refers to a set of phenomena that emerge when working with data in high-dimensional spaces. One of these phenomena arises during learning, specifically during feature selection. Feature selection is the process of selecting a subset of features from a data set. Intuitively, accuracy should improve as we add more features. However, with a fixed sample size, the accuracy eventually stops improving as more features are added, and in some cases it even decreases.
For example, with a fixed sample size, suppose we are trying to identify a person from a picture. We can start off with two features, such as eye color and hair color, and, as our intuition suggests, accuracy improves as we add more features (distance between the eyes, skin color, etc.). However, as we include more and more features without increasing the sample size, the data becomes increasingly sparse in the feature space, and the accuracy eventually stops improving and may even degrade.
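Before running the full text-classification experiment below, the peaking effect can be sketched on synthetic data: hold the sample size and the number of informative features fixed, and pad the data with noise features. The snippet below is a minimal sketch of that idea (the sample size, the 5 informative features, and the k-NN classifier are arbitrary illustrative choices, not part of the experiment that follows); as the number of noise features grows, test accuracy typically stops improving and then degrades.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for n_features in [5, 10, 50, 100, 500, 1000]:
    # Fixed sample size; only 5 features carry signal, the rest are noise
    X, y = make_classification(n_samples=200, n_features=n_features,
                               n_informative=5, n_redundant=0, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf = KNeighborsClassifier(3)
    clf.fit(X_train, y_train)
    print(n_features, accuracy_score(y_test, clf.predict(X_test)))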
import plotly
plotly.tools.set_credentials_file(username='usc_eric_vader', api_key='ZH0urCkUHtrTAPURFP32')
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import math
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
"Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1),
AdaBoostClassifier()]
import os
import pickle
HISTORY_FILENAME_TEMPLATE = "accuracy_{}.pkl"
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
import multiprocessing as mp
import sys
def experiment(i, twenty, vectors, clf):
    # Select the i best features by chi-squared score, train clf on the
    # reduced training set, and return its accuracy on the test set.
    twenty_train, twenty_test = twenty
    vectors_train, vectors_test = vectors
    fs = SelectKBest(chi2, k=i)
    vectors_fs = fs.fit_transform(vectors_train, twenty_train.target)
    vectors_test_fs = fs.transform(vectors_test)
    clf.fit(vectors_fs, twenty_train.target)
    predict = clf.predict(vectors_test_fs)
    return accuracy_score(twenty_test.target, predict)
history = {}
# In this block, we will start all of the jobs
with mp.Pool(mp.cpu_count()) as pool:
    # Fetch training and test sets
    twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      shuffle=True, random_state=42)
    twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                     shuffle=True, random_state=42)
    twenty = twenty_train, twenty_test
    # Use tf-idf features
    vectorizer = TfidfVectorizer()
    vectors_train = vectorizer.fit_transform(twenty_train.data)
    vectors_test = vectorizer.transform(twenty_test.data)
    vectors = vectors_train, vectors_test
    num_feats = vectors_train.shape[1]
    # Feature counts to try: all powers b**i (for bases 2 to 10) that do not exceed num_feats
    target_num_feats = set()
    for b in range(2, 11):
        target_num_feats.update([b**i for i in range(0, int(math.log(num_feats, b)) + 1)])
    target_num_feats = sorted(target_num_feats)
    for name, clf in zip(names, classifiers):
        history_filename = HISTORY_FILENAME_TEMPLATE.format(name)
        # If saved results exist, load them and skip recomputation
        if os.path.isfile(history_filename):
            with open(history_filename, "rb") as f:
                history[name] = pickle.load(f)
            continue
        else:
            print("Processing {}".format(name))
            # Feature selection with different numbers of selected features,
            # dispatched asynchronously to the worker pool
            history[name] = [pool.apply_async(experiment, args=(i, twenty, vectors, clf))
                             for i in target_num_feats]
    # In this loop, we will collect all of the jobs
    for name, clf in zip(names, classifiers):
        history_filename = HISTORY_FILENAME_TEMPLATE.format(name)
        if os.path.isfile(history_filename):
            continue
        else:
            print("Collecting {}".format(name))
            # Pair the feature counts with the collected accuracies so the
            # in-memory entry matches the format pickled to disk
            history[name] = (target_num_feats, [h.get() for h in history[name]])
            with open(history_filename, "wb") as f:
                pickle.dump(history[name], f)
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
traces = []
for k, v in history.items():
    x, y = v
    traces.append(
        go.Scatter(y=y,
                   x=np.log10(x),
                   mode="lines",
                   name=k,
                   line={'shape': 'spline'})
    )
layout1 = go.Layout(xaxis=dict(title="log(Number of Features)",
                               rangemode="tozero"),
                    yaxis=dict(title="Test Accuracy",
                               rangemode="tozero"))
fig1 = go.Figure(data=traces, layout=layout1)
py.iplot(fig1, filename="Test Accuracy for various classifiers")