It seems that I figured out how to get scores.
# Fit a 1-factor FactorAnalysis model and print per-sample scores.
from sklearn import decomposition, preprocessing
import numpy as np

# Load the CSV and drop any row containing a NaN.
data = np.genfromtxt('rangir_test.csv', delimiter=',')
data = data[~np.isnan(data).any(axis=1)]

# Standardize columns to zero mean / unit variance before fitting.
data_normal = preprocessing.scale(data)

fa = decomposition.FactorAnalysis(n_components=1)
fa.fit(data_normal)

# NOTE(review): score_samples() returns the log-likelihood of each sample
# under the fitted model, NOT the factor scores that R's factanal() reports;
# use fa.transform(data_normal) for factor scores — confirm which is wanted.
for score in fa.score_samples(data_normal):
    print(score)  # print() function: Python 3 (original was Python-2 `print score`)
Scikit-learn scores output:
-69.8587183816, -116.353511148, -24.1529840248, -36.5366398005, -7.87165586175, -24.9012815104, -23.9148486368, -10.047780535, -4.03376369723, -7.07428842783, -7.44222705099, -6.25705487929, -13.2313513762, -13.3253819521, -9.23993173528, -7.141616656, -5.57915693405, -6.82400483045, -15.0906961724, -3.37447211233, -5.41032267015, -5.75224753811, -19.7230390792, -6.75268922909, -4.04911793705, -10.6062761691, -3.17417070498, -9.95916350005, -3.25893428094, -3.88566777358, -3.30908856716, -3.58141292341, -3.90778368669, -4.01462493538, -11.6683969455, -5.30068548445, -24.3400870389, -7.66035331181, -13.8321672858, -8.93461397086, -17.4068326999
For everyone used to R factanal there is a python package available that wraps the R factanal function so that you can just call it from python with a pandas data frame like this:
# Call R's factanal() from Python via the `factanal` wrapper package
# (requires R to be installed; install the wrapper with `pip install factanal`).
from factanal.wrapper import factanal

# pdf: a pandas DataFrame of the observed variables (defined by the caller).
fa_res = factanal(pdf, factors=4, scores='regression', rotation='promax',
                  verbose=True, return_dict=True)
Install with:
pip install factanal
It seems that I figured out how to get scores. Unfortunately, the output (see below) is very different from the one produced by factanal(). Any advice on decomposition.FactorAnalysis() will be appreciated. For everyone used to R's factanal, there is a Python package available that wraps the R factanal function so that you can call it from Python with a pandas data frame. This is late, but it may still be interesting for the OP or others who came here from Google.
It seems that I figured out how to get scores.
# Fit a 1-factor FactorAnalysis model and print per-sample scores.
from sklearn import decomposition, preprocessing
import numpy as np

# Load the CSV and drop any row containing a NaN.
data = np.genfromtxt('rangir_test.csv', delimiter=',')
data = data[~np.isnan(data).any(axis=1)]

# Standardize columns to zero mean / unit variance before fitting.
data_normal = preprocessing.scale(data)

fa = decomposition.FactorAnalysis(n_components=1)
fa.fit(data_normal)

# NOTE(review): score_samples() returns the log-likelihood of each sample
# under the fitted model, NOT the factor scores that R's factanal() reports;
# use fa.transform(data_normal) for factor scores — confirm which is wanted.
for score in fa.score_samples(data_normal):
    print(score)  # print() function: Python 3 (original was Python-2 `print score`)
Scikit-learn scores output:
-69.8587183816, -116.353511148, -24.1529840248, -36.5366398005, -7.87165586175, -24.9012815104, -23.9148486368, -10.047780535, -4.03376369723, -7.07428842783, -7.44222705099, -6.25705487929, -13.2313513762, -13.3253819521, -9.23993173528, -7.141616656, -5.57915693405, -6.82400483045, -15.0906961724, -3.37447211233, -5.41032267015, -5.75224753811, -19.7230390792, -6.75268922909, -4.04911793705, -10.6062761691, -3.17417070498, -9.95916350005, -3.25893428094, -3.88566777358, -3.30908856716, -3.58141292341, -3.90778368669, -4.01462493538, -11.6683969455, -5.30068548445, -24.3400870389, -7.66035331181, -13.8321672858, -8.93461397086, -17.4068326999
For everyone used to R factanal there is a python package available that wraps the R factanal function so that you can just call it from python with a pandas data frame like this:
# Call R's factanal() from Python via the `factanal` wrapper package
# (requires R to be installed; install the wrapper with `pip install factanal`).
from factanal.wrapper import factanal

# pdf: a pandas DataFrame of the observed variables (defined by the caller).
fa_res = factanal(pdf, factors=4, scores='regression', rotation='promax',
                  verbose=True, return_dict=True)
Updated: November 7, 2021
# PCA walk-through on a gene-expression example dataset.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from bioinfokit.analys import get_data
import numpy as np
import pandas as pd

# Load the example dataset as a pandas DataFrame.
df = get_data('gexp').data
df.head(2)
# output:
#          A        B        C        D        E         F
# 0  4.50570  3.26036 -1.24940  8.89807  8.05955 -0.842803
# 1  3.50856  1.66079 -1.85668 -2.57336 -1.37370  1.196000
# Variables A to F denote multiple conditions associated with fungal stress.
# Read the full paper:
# https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0138025
# Standardize each variable to zero mean / unit variance.
# This step is optional, but recommended before PCA so that variables with
# large scales do not dominate the components.
df_st = StandardScaler().fit_transform(df)
pd.DataFrame(df_st, columns=df.columns).head(2)
# output:
#          A         B         C         D         E         F
# 0  0.619654  0.448280 -0.240867  2.457058  2.304732 -0.331489
# 1  0.342286 -0.041499 -0.428652 -1.214732 -0.877151  0.474930
# Fit PCA with all components on the standardized data.
pca_out = PCA().fit(df_st)

# Proportion of variance explained (PC1 to PC6).
pca_out.explained_variance_ratio_
# output: array([0.2978742, 0.27481252, 0.23181442, 0.19291638, 0.00144353, 0.00113895])

# Cumulative proportion of variance (PC1 to PC6).
np.cumsum(pca_out.explained_variance_ratio_)
# output: array([0.2978742, 0.57268672, 0.80450114, 0.99741752, 0.99886105, 1.])

# Component loadings (correlation between the original variables and the
# components). The loadings are the elements of the eigenvectors; the squared
# loadings within each PC sum to 1.
loadings = pca_out.components_
# NOTE(review): `n_features_` was deprecated and removed in scikit-learn >= 1.2;
# on recent versions use `pca_out.n_features_in_` — confirm the installed version.
num_pc = pca_out.n_features_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]
loadings_df = pd.DataFrame.from_dict(dict(zip(pc_list, loadings)))
loadings_df['variable'] = df.columns.values
loadings_df = loadings_df.set_index('variable')
loadings_df
# output:
#                PC1       PC2       PC3       PC4       PC5       PC6
# variable
# A        -0.510898  0.452234  0.227356 -0.323464  0.614881  0.008372
# B        -0.085908  0.401197  0.708556  0.132788 -0.558448 -0.010616
# C         0.477477 -0.100994  0.462437  0.487951  0.556605  0.007893
# D         0.370318  0.611485 -0.308295  0.054973 -0.007642  0.625159
# E         0.568491  0.300118 -0.011775 -0.484115  0.009382 -0.593425
# F         0.208090 -0.400426  0.370440 -0.634234 -0.010111  0.506732
# Positive and negative values in the loadings reflect positive and negative
# correlation of the variables with the PCs. Except A and B, all other
# variables have a positive projection on the first PC.

# Correlation-matrix style heatmap of the loadings.
import seaborn as sns
import matplotlib.pyplot as plt

ax = sns.heatmap(loadings_df, annot=True, cmap='Spectral')
plt.show()
# Eigenvalues: variance explained by each PC.
pca_out.explained_variance_
# output: array([1.78994905, 1.65136965, 1.39299071, 1.15924943, 0.0086743, 0.00684401])

# Scree plot (for the scree / elbow test).
from bioinfokit.visuz import cluster

cluster.screeplot(obj=[pc_list, pca_out.explained_variance_ratio_])
# The scree plot is saved in the current directory as screeplot.png.
# PCA loadings plots (2D and 3D).
# 2D: PC1 vs PC2 loadings, axes labelled with % variance explained.
cluster.pcaplot(x=loadings[0], y=loadings[1], labels=df.columns.values,
                var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
                var2=round(pca_out.explained_variance_ratio_[1] * 100, 2))

# 3D: PC1 vs PC2 vs PC3 loadings.
cluster.pcaplot(x=loadings[0], y=loadings[1], z=loadings[2],
                labels=df.columns.values,
                var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
                var2=round(pca_out.explained_variance_ratio_[1] * 100, 2),
                var3=round(pca_out.explained_variance_ratio_[2] * 100, 2))
# PC scores: project the standardized samples onto the components.
pca_scores = PCA().fit_transform(df_st)

# 2D biplot: sample scores overlaid with variable loadings.
cluster.biplot(cscore=pca_scores, loadings=loadings, labels=df.columns.values,
               var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
               var2=round(pca_out.explained_variance_ratio_[1] * 100, 2))

# 3D biplot.
cluster.biplot(cscore=pca_scores, loadings=loadings, labels=df.columns.values,
               var1=round(pca_out.explained_variance_ratio_[0] * 100, 2),
               var2=round(pca_out.explained_variance_ratio_[1] * 100, 2),
               var3=round(pca_out.explained_variance_ratio_[2] * 100, 2))