How does sklearn cross_val_score use KFold?

Try this

# KFold.split yields (train_indices, test_indices) pairs, one pair per fold.
from sklearn.model_selection import KFold
import numpy as np

dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])  # example data; use your own array here

kf = KFold(n_splits = 2)
for train, test in kf.split(dataset):
    print("*" * 20)
    print("Training Data:")
    for i in train:
        print(dataset[i])
    print("Test Data:")
    for j in test:
        print(dataset[j])

Suggestion : 2

Splitting a dataset into a training set and a testing set is an essential, basic task when getting a machine learning model ready for training: to determine whether the model is overfitting, we need to test it on unseen data (a validation set). Cross-validation is simply a method that reserves part of the dataset for testing the model (the validation set) and uses the remaining data to train it. In k-fold cross-validation the dataset is split into k subsets; k-1 subsets are used to train the model and the remaining subset is kept as a validation set to test it. The score of the model on each fold is then averaged to evaluate the model's performance. The cross_val_score function takes the model, the dataset, the labels, and the cross-validation method as input arguments; you can learn more about its functionality and parameters in the scikit-learn documentation.
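
cross_val_score does not use KFold in any special way: it asks the splitter for train/test index arrays, fits a fresh clone of the estimator on the training indices of each fold, scores it on the test indices, and returns the per-fold scores. Below is a minimal sketch of that idea (run_kfold_cv is an illustrative helper, not sklearn's actual internals; X and y are assumed to be NumPy arrays so positional indexing works):

# Sketch of what cross_val_score does with a KFold splitter (illustrative only).
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

def run_kfold_cv(estimator, X, y, cv):
    scores = []
    # The splitter yields one (train_idx, test_idx) pair per fold.
    for train_idx, test_idx in cv.split(X, y):
        fold_model = clone(estimator)               # fresh, unfitted copy for this fold
        fold_model.fit(X[train_idx], y[train_idx])  # fit on the k-1 training folds
        scores.append(fold_model.score(X[test_idx], y[test_idx]))  # score on the held-out fold
    return np.array(scores)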

# Importing required libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Loading the dataset
data = load_breast_cancer(as_frame = True)
df = data.frame
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

#Implementing cross validation

k = 5
kf = KFold(n_splits = k, random_state = None)
model = LogisticRegression(solver = 'liblinear')

acc_score = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

avg_acc_score = sum(acc_score) / k

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))
accuracy of each fold - [0.9122807017543859, 0.9473684210526315, 0.9736842105263158, 0.9736842105263158, 0.9557522123893806]
Avg accuracy: 0.952553951249806
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

data = load_breast_cancer(as_frame = True)
df = data.frame
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

k = 5
kf = KFold(n_splits = k, random_state = None)
model = LogisticRegression(solver = 'liblinear')

result = cross_val_score(model, X, y, cv = kf)

print("Avg accuracy: {}".format(result.mean()))
Avg accuracy: 0.952553951249806
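
For completeness: cv in cross_val_score also accepts a plain integer. In that case sklearn builds the splitter itself; for a classifier with binary or multiclass labels it uses a StratifiedKFold with that many folds, otherwise a plain KFold, both unshuffled. Passing the KFold object explicitly, as above, is mainly useful when you want to control the splitting yourself. A short variant of the call above, reusing model, X, and y (the scores will differ slightly because the folds are stratified):

# cv given as an integer: for this classifier sklearn internally builds
# StratifiedKFold(n_splits=5), so each fold keeps the class balance.
result_int_cv = cross_val_score(model, X, y, cv=5)
print("Avg accuracy (cv=5): {}".format(result_int_cv.mean()))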

Suggestion : 3

K-fold cross-validation with Python, shown two ways: using a cross-validation generator (StratifiedKFold) directly, and using sklearn's cross_val_score.

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
# X_train and y_train are assumed to be a pandas DataFrame/Series from an earlier train/test split
#
# Create an instance of Pipeline
#
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 100, max_depth = 4))
#
# Create an instance of StratifiedKFold which can be used to get indices of different training and test folds
#
strtfdKFold = StratifiedKFold(n_splits = 10)
kfold = strtfdKFold.split(X_train, y_train)
scores = []
#
# Iterate over the folds: fit on the training indices, score on the held-out fold
#
for k, (train, test) in enumerate(kfold):
    pipeline.fit(X_train.iloc[train, :], y_train.iloc[train])
    score = pipeline.score(X_train.iloc[test, :], y_train.iloc[test])
    scores.append(score)
    print('Fold: %2d, Training/Test Split Distribution: %s, Accuracy: %.3f' % (k + 1, np.bincount(y_train.iloc[train]), score))

print('\n\nCross-Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#
# Create an instance of Pipeline
#
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 100, max_depth = 4))
#
# Pass instance of pipeline and training and test data set
# cv = 10 represents the StratifiedKFold with 10 folds
#
scores = cross_val_score(pipeline, X = X_train, y = y_train, cv = 10, n_jobs = 1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
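
Because the pipeline is a classifier, cv = 10 above is turned into an unshuffled StratifiedKFold with 10 folds, i.e. the same splits as the generator-based block before it. If you want shuffled or reproducible folds, build the splitter yourself and pass it as cv. A sketch reusing pipeline, X_train and y_train from above (the shuffle and random_state values are just illustrative choices):

from sklearn.model_selection import StratifiedKFold

# Pass the splitter object explicitly instead of cv = 10, here with shuffling
# and a fixed random_state so the folds are reproducible.
cv_splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X=X_train, y=y_train, cv=cv_splitter)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))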