In [1]:
import os
import sys
import string
sys.path.append(os.path.pardir)
%matplotlib inline
import numpy as np
import pandas as pd
from fa_kit import FactorAnalysis
from fa_kit import plotting as fa_plotting
This function will generate samples of fake data and store them in the n_samples-by-n_features matrix data, just like we did in Episode 0. But then we do a couple of new things:
We scale the first 10 features by 2, resulting in unequal signal variance across features while still keeping the same correlation structure.
We place the data into a pandas DataFrame and name each of the columns with a string. We will pass this DataFrame directly into the factor analysis, and the output will have each of the features properly labelled.
In [2]:
def make_random_data(n_samp=10000, n_feat=100):
    """
    make some random data with correlated features
    """
    data = np.random.randn(n_samp, n_feat)

    signal_width = 10
    signal_overlap = 2
    step_size = signal_width - signal_overlap

    for i in range(0, data.shape[1], step_size):
        # add a shared signal to a block of adjacent features
        shared_signal = 0.3 * np.random.randn(n_samp, 1)
        data[:, i:(i + signal_width)] += shared_signal

    return data

data = make_random_data()
data[:, :10] *= 2

# note: string.lowercase is Python 2 only; use string.ascii_lowercase
column_names = [
    "{}{}".format(i, string.ascii_lowercase[i % len(string.ascii_lowercase)])
    for i in range(data.shape[1])]

df_data = pd.DataFrame(
    data,
    columns=column_names
)
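As a quick sanity check (not part of the original notebook), we can confirm the block structure we just built: two features inside the same 10-wide signal block should show a small but reliable positive correlation, while features in distant blocks should be uncorrelated.
# Sanity check on the simulated correlation structure
corr = df_data.corr()
print(corr.iloc[0, 5])   # same signal block: roughly 0.08
print(corr.iloc[0, 50])  # distant blocks: near zero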
The function run_pipeline will take the DataFrame of data as input and run a factor analysis. We now have two input arguments that control how the data will be preprocessed:
preproc_demean determines whether each dimension should be de-meaned
preproc_scale determines whether each dimension should be scaled to unit standard deviation
Setting both to True means the components are extracted from a correlation matrix. If preproc_demean=True and preproc_scale=False, you'll get a covariance matrix instead; the short numpy sketch below illustrates the difference.
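To make that concrete, here is a minimal numpy sketch (not part of the original notebook): de-meaning alone gives a covariance matrix, and adding unit-variance scaling turns it into a correlation matrix.
import numpy as np

x = np.random.randn(1000, 3) * [1.0, 2.0, 5.0]  # features with unequal variance

cov = np.cov(x, rowvar=False)        # demean only: covariance matrix
corr = np.corrcoef(x, rowvar=False)  # demean + scale: correlation matrix

# Standardizing each column first makes the covariance of the standardized
# data equal the correlation matrix of the raw data
z = (x - x.mean(axis=0)) / x.std(axis=0, ddof=1)
print(np.allclose(np.cov(z, rowvar=False), corr))  # True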
These same preprocessing steps will be applied to any other data that you feed into the FactorAnalysis object using the method get_component_scores.
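For example, a hypothetical usage sketch (it assumes get_component_scores accepts an n_samples-by-n_features array, like load_data_samples does):
# Hypothetical: `fa` is a fitted FactorAnalysis object, e.g. the output
# of run_pipeline below; the same demean/scale steps are applied internally
new_data = make_random_data(n_samp=500)
scores = fa.get_component_scores(new_data)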
If you want to come up with your own crazy num_dimension-by-num_dimension feature association matrix, go ahead. Calculate it in Python ahead of time, and pass it into the FactorAnalysis object using the method load_data_cov (rather than load_data_samples).
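For instance, a minimal sketch of that route, assuming load_data_cov accepts any square association matrix (the Spearman correlation here is just an arbitrary illustration):
# Hypothetical: build a custom feature association matrix, then load it
assoc = df_data.corr(method='spearman').values
fa_custom = FactorAnalysis.load_data_cov(assoc)
fa_custom.extract_components()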
In [3]:
def run_pipeline(data, preproc_demean=True, preproc_scale=True,
                 retain_method='broken_stick', rotation_method='varimax',
                 **kwargs):

    # Set up the factor analysis object, indicate how to calculate the
    # association matrix out of this input data.
    fa = FactorAnalysis.load_data_samples(
        data,
        preproc_demean=preproc_demean,
        preproc_scale=preproc_scale,
        **kwargs
        )

    # Extract the components
    fa.extract_components()

    # Calculate how many components to retain
    # You can use any of these methods:
    # 'top_n', 'top_pct', 'kaiser', 'broken_stick'
    fa.find_comps_to_retain(
        method=retain_method,
        **kwargs
        )

    # Once you know how many to retain, re-extract with PAF
    fa.reextract_using_paf()

    # Apply factor rotation
    # Right now there are both 'varimax' and 'quartimax'
    fa.rotate_components(
        method=rotation_method
        )

    return fa
First, run the pipeline with no preprocessing. Because we scaled the first 10 features by 2, their larger raw variance should dominate the extracted components.
In [4]:
fa_unscaled = run_pipeline(
df_data,
retain_method='broken_stick',
preproc_demean=False,
preproc_scale=False
)
fig = fa_plotting.graph_summary(fa_unscaled)
Now rescale each feature to unit standard deviation before extraction, so that the shared-signal correlation structure, rather than the raw variances, drives the result.
In [5]:
fa_scaled = run_pipeline(
df_data,
retain_method='broken_stick',
preproc_demean=False,
preproc_scale=True
)
fig = fa_plotting.graph_summary(fa_scaled)
Finally, print a text summary listing the top 4 items loading on each retained component.
In [6]:
fa_plotting.text_summary(fa_scaled, top_n_items=4)