How to calculate factor analysis scores using Python (scikit-learn)?

  • Last Update :
  • Techknowledgy :

Seems that I figured out how to get scores.

# Fit a one-factor FactorAnalysis model on a standardized CSV dataset and
# print a per-sample score for each row.
from sklearn import decomposition, preprocessing
import numpy as np

# Load the CSV and drop every row that contains a NaN anywhere.
data = np.genfromtxt('rangir_test.csv', delimiter=',')
data = data[~np.isnan(data).any(axis=1)]
# Standardize columns to zero mean / unit variance before fitting.
data_normal = preprocessing.scale(data)
fa = decomposition.FactorAnalysis(n_components=1)
fa.fit(data_normal)
# NOTE: score_samples() returns the log-likelihood of each sample under the
# fitted model, NOT the latent factor scores; for R-factanal-style factor
# scores use fa.transform(data_normal) instead.
for score in fa.score_samples(data_normal):
    print(score)  # Python 3 print function (original used Python 2 `print x`)

Scikit-learn scores output:

-69.8587183816
-116.353511148
-24.1529840248
-36.5366398005
-7.87165586175
-24.9012815104
-23.9148486368
-10.047780535
-4.03376369723
-7.07428842783
-7.44222705099
-6.25705487929
-13.2313513762
-13.3253819521
-9.23993173528
-7.141616656
-5.57915693405
-6.82400483045
-15.0906961724
-3.37447211233
-5.41032267015
-5.75224753811
-19.7230390792
-6.75268922909
-4.04911793705
-10.6062761691
-3.17417070498
-9.95916350005
-3.25893428094
-3.88566777358
-3.30908856716
-3.58141292341
-3.90778368669
-4.01462493538
-11.6683969455
-5.30068548445
-24.3400870389
-7.66035331181
-13.8321672858
-8.93461397086
-17.4068326999

For everyone used to R factanal there is a python package available that wraps the R factanal function so that you can just call it from python with a pandas data frame like this:

# Call R's factanal() from Python via the `factanal` wrapper package
# (pip install factanal). `pdf` is a pandas DataFrame of the raw data.
from factanal.wrapper import factanal

# regression-method factor scores with a promax (oblique) rotation;
# return_dict=True gives a plain dict instead of an R result object.
fa_res = factanal(pdf, factors=4, scores='regression', rotation='promax',
                  verbose=True, return_dict=True)

Install with:

pip install factanal

Suggestion : 2

Seems that I figured out how to get scores. Unfortunately, the output (see below) is very different from the one produced by factanal(); any advice on decomposition.FactorAnalysis() would be appreciated. For everyone used to R's factanal, there is a Python package available that wraps the R factanal function so that you can call it from Python with a pandas data frame. This is late, but it may still be interesting for the OP or for others who arrive here from Google.

Seems that I figured out how to get scores.

# Fit a one-factor FactorAnalysis model on a standardized CSV dataset and
# print a per-sample score for each row.
from sklearn import decomposition, preprocessing
import numpy as np

# Load the CSV and drop every row that contains a NaN anywhere.
data = np.genfromtxt('rangir_test.csv', delimiter=',')
data = data[~np.isnan(data).any(axis=1)]
# Standardize columns to zero mean / unit variance before fitting.
data_normal = preprocessing.scale(data)
fa = decomposition.FactorAnalysis(n_components=1)
fa.fit(data_normal)
# NOTE: score_samples() returns the log-likelihood of each sample under the
# fitted model, NOT the latent factor scores; for R-factanal-style factor
# scores use fa.transform(data_normal) instead.
for score in fa.score_samples(data_normal):
    print(score)  # Python 3 print function (original used Python 2 `print x`)

Scikit-learn scores output:

-69.8587183816
-116.353511148
-24.1529840248
-36.5366398005
-7.87165586175
-24.9012815104
-23.9148486368
-10.047780535
-4.03376369723
-7.07428842783
-7.44222705099
-6.25705487929
-13.2313513762
-13.3253819521
-9.23993173528
-7.141616656
-5.57915693405
-6.82400483045
-15.0906961724
-3.37447211233
-5.41032267015
-5.75224753811
-19.7230390792
-6.75268922909
-4.04911793705
-10.6062761691
-3.17417070498
-9.95916350005
-3.25893428094
-3.88566777358
-3.30908856716
-3.58141292341
-3.90778368669
-4.01462493538
-11.6683969455
-5.30068548445
-24.3400870389
-7.66035331181
-13.8321672858
-8.93461397086
-17.4068326999

For everyone used to R factanal there is a python package available that wraps the R factanal function so that you can just call it from python with a pandas data frame like this:

# Call R's factanal() from Python via the `factanal` wrapper package
# (pip install factanal). `pdf` is a pandas DataFrame of the raw data.
from factanal.wrapper import factanal

# regression-method factor scores with a promax (oblique) rotation;
# return_dict=True gives a plain dict instead of an R result object.
fa_res = factanal(pdf, factors=4, scores='regression', rotation='promax',
                  verbose=True, return_dict=True)

Suggestion : 3

Updated: November 7, 2021

# PCA walk-through on the bioinfokit 'gexp' example dataset:
# standardize the data, fit PCA, inspect explained variance and loadings,
# then draw scree, loadings, and biplot figures with bioinfokit.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from bioinfokit.analys import get_data
import numpy as np
import pandas as pd

# load dataset as pandas dataframe
df = get_data('gexp').data
df.head(2)
# output:
#          A        B        C        D        E         F
# 0  4.50570  3.26036 -1.24940  8.89807  8.05955 -0.842803
# 1  3.50856  1.66079 -1.85668 -2.57336 -1.37370  1.196000
# variables A to F denote multiple conditions associated with fungal stress
# Read full paper https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0138025

# standardization is optional, but PCA is scale-sensitive, so it is usually wise
df_st = StandardScaler().fit_transform(df)
pd.DataFrame(df_st, columns=df.columns).head(2)
# output:
#          A         B         C         D         E         F
# 0  0.619654  0.448280 -0.240867  2.457058  2.304732 -0.331489
# 1  0.342286 -0.041499 -0.428652 -1.214732 -0.877151  0.474930
pca_out = PCA().fit(df_st)

# get the component variance
# Proportion of Variance (from PC1 to PC6)
pca_out.explained_variance_ratio_
# output:
# array([0.2978742, 0.27481252, 0.23181442, 0.19291638, 0.00144353, 0.00113895])

# Cumulative proportion of variance (from PC1 to PC6)
np.cumsum(pca_out.explained_variance_ratio_)
# output:
# array([0.2978742, 0.57268672, 0.80450114, 0.99741752, 0.99886105, 1.])

# component loadings or weights (correlation coefficient between original
# variables and the component); loadings are the elements of the eigenvectors,
# and the squared loadings within a PC always sum to 1
loadings = pca_out.components_
# NOTE: PCA.n_features_ was deprecated and removed in scikit-learn >= 1.2;
# use pca_out.n_features_in_ on modern versions.
num_pc = pca_out.n_features_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]
loadings_df = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
loadings_df['variable'] = df.columns.values
loadings_df = loadings_df.set_index('variable')
loadings_df
# output:
#                PC1       PC2       PC3       PC4       PC5       PC6
# variable
# A        -0.510898  0.452234  0.227356 -0.323464  0.614881  0.008372
# B        -0.085908  0.401197  0.708556  0.132788 -0.558448 -0.010616
# C         0.477477 -0.100994  0.462437  0.487951  0.556605  0.007893
# D         0.370318  0.611485 -0.308295  0.054973 -0.007642  0.625159
# E         0.568491  0.300118 -0.011775 -0.484115  0.009382 -0.593425
# F         0.208090 -0.400426  0.370440 -0.634234 -0.010111  0.506732

# positive and negative values in component loadings reflect the positive and
# negative correlation of the variables with the PCs. Except A and B, all
# other variables have a positive projection on the first PC.

# get correlation matrix plot for loadings
import seaborn as sns
import matplotlib.pyplot as plt
ax = sns.heatmap(loadings_df, annot=True, cmap='Spectral')
plt.show()

# get eigenvalues (variance explained by each PC)
pca_out.explained_variance_
# output:
# array([1.78994905, 1.65136965, 1.39299071, 1.15924943, 0.0086743, 0.00684401])

# get scree plot (for scree or elbow test);
# saved in the working directory as screeplot.png
from bioinfokit.visuz import cluster
cluster.screeplot(obj=[pc_list, pca_out.explained_variance_ratio_])

# get PCA loadings plots (2D and 3D)
# 2D
cluster.pcaplot(x=loadings[0], y=loadings[1], labels=df.columns.values,
                var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
                var2=round(pca_out.explained_variance_ratio_[1] * 100, 2))

# 3D
cluster.pcaplot(x=loadings[0], y=loadings[1], z=loadings[2], labels=df.columns.values,
                var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
                var2=round(pca_out.explained_variance_ratio_[1] * 100, 2),
                var3=round(pca_out.explained_variance_ratio_[2] * 100, 2))

# get PC scores — reuse the already-fitted model instead of refitting a
# brand-new PCA on the same data (the original called PCA().fit_transform)
pca_scores = pca_out.transform(df_st)

# get 2D biplot
cluster.biplot(cscore=pca_scores, loadings=loadings, labels=df.columns.values,
               var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
               var2=round(pca_out.explained_variance_ratio_[1] * 100, 2))

# get 3D biplot
cluster.biplot(cscore=pca_scores, loadings=loadings, labels=df.columns.values,
               var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
               var2=round(pca_out.explained_variance_ratio_[1] * 100, 2),
               var3=round(pca_out.explained_variance_ratio_[2] * 100, 2))