# citation("palmerpenguins")   # R command that produced the citation below
#>
#> To cite palmerpenguins in publications use:
#>
#> Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer
#> Archipelago (Antarctica) penguin data. R package version 0.1.0.
#> https://allisonhorst.github.io/palmerpenguins/. doi:
#> 10.5281/zenodo.3960218.
#>
#> A BibTeX entry for LaTeX users is
#>
#> @Manual{,
#> title = {palmerpenguins: Palmer Archipelago (Antarctica) penguin data},
#> author = {Allison Marie Horst and Alison Presmanes Hill and Kristen B Gorman},
#> year = {2020},
#> note = {R package version 0.1.0},
#> doi = {10.5281/zenodo.3960218},
#> url = {https://allisonhorst.github.io/palmerpenguins/},
#> }
# Data: https://github.com/allisonhorst/palmerpenguins
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn import metrics
# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Preprocesado y modelado
# ==============================================================================
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Configuración matplotlib
# ==============================================================================
# Default to a diverging colormap and crop whitespace when saving figures.
plt.rcParams['image.cmap'] = "bwr"
plt.rcParams['savefig.bbox'] = "tight"
# BUG FIX: `style.use(...)` returns None, so the original
# `style.use('ggplot') or plt.style.use('ggplot')` always executed BOTH calls.
# `matplotlib.style` and `plt.style` are the same module; one call suffices.
plt.style.use('ggplot')
# Warnings configuration
# ==============================================================================
import warnings
# NOTE(review): a blanket ignore hides deprecation notices too; consider
# narrowing the filter (e.g. by category) once the notebook is stable.
warnings.filterwarnings('ignore')
# saving the model
import pickle
# Load the Palmer penguins data and keep only the two culmen (bill)
# measurements as predictors.
data = pd.read_csv('penguins.csv')
data.rename({'Culmen Length (mm)': 'Culmen_Length',
             'Culmen Depth (mm)': 'Culmen_Depth'}, axis=1, inplace=True)
# Drop every column except Culmen_Length and Culmen_Depth.
X = data.drop(['studyName', 'Sample Number', 'Region', 'Island', 'Stage',
               'Species', 'Individual ID', 'Clutch Completion', 'Date Egg',
               'Flipper Length (mm)', 'Body Mass (g)', 'Sex',
               'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Comments'], axis=1)
# X.columns -> Index(['Culmen_Length', 'Culmen_Depth'], dtype='object')
# Encode the three species as integers 0/1/2 (sklearn's LabelEncoder would
# work as well); unique labels observed in the raw data:
#   'Adelie Penguin (Pygoscelis adeliae)', 'Gentoo penguin (Pygoscelis papua)',
#   'Chinstrap penguin (Pygoscelis antarctica)'
data['Species'] = data['Species'].map({
    'Adelie Penguin (Pygoscelis adeliae)': 0,
    'Gentoo penguin (Pygoscelis papua)': 1,
    'Chinstrap penguin (Pygoscelis antarctica)': 2,
})
y = data['Species']
# Stratified 80/20 split so each species keeps its class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.8, random_state=1234)
def model(X, y):
    """Grid-search a scale -> select-k-best -> logistic-regression pipeline.

    Parameters
    ----------
    X : feature matrix (DataFrame or 2-D array).
    y : target labels aligned with the rows of X.

    Returns
    -------
    The fitted GridSearchCV object (exposes best_params_, best_score_,
    predict, ...).
    """
    # Work on copies so the caller's data is never mutated.
    X = X.copy()
    y = y.copy()
    pipeline = Pipeline(steps=[
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=mutual_info_classif)),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ])
    # BUG FIX: range(1, X.shape[1]) excluded k == n_features, so the grid
    # could never keep ALL features (with 2 columns it only tried k=1).
    # The inclusive upper bound matches the hand-built pipeline that uses k=2.
    param_grid = {
        'feature_selection__k': range(1, X.shape[1] + 1),
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               scoring='accuracy',
                               n_jobs=-1,
                               cv=3)
    grid_search.fit(X, y)
    return grid_search
# Fit a reference pipeline with fixed hyperparameters (k=2, C=1000) purely to
# read out the mutual-information score of each candidate feature.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=2)),
    ('classifier', LogisticRegression(random_state=11, max_iter=1000, C=1000)),
])
pipeline.fit(X_train, y_train)
# Map each feature name to its mutual-information score, rounded to 2 d.p.
feature_scores = {
    name: score
    for name, score in zip(X_train.columns,
                           np.round(pipeline['feature_selection'].scores_, 2))
}
# Fit the final model via GridSearchCV and persist it with pickle.
penguins_model = model(X_train, y_train)
# `with` guarantees the file handle is closed even if dump() raises.
with open("classifier_penguins.pkl", mode="wb") as pickle_out:
    pickle.dump(penguins_model, pickle_out)
print(f'Best params: {penguins_model.best_params_}\n'
      f'Best score: {penguins_model.best_score_}\n'
      f'Features scores: {feature_scores}')
#> Best params: {'classifier__C': 10, 'feature_selection__k': 1}
#> Best score: 0.7527074374900463
#> Features scores: {'Culmen_Length': 0.53, 'Culmen_Depth': 0.54}
# Predict on the held-out 20% and inspect the confusion matrix.
y_pred = penguins_model.predict(X_test)
# Example y_pred from one run:
# array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
#        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
#        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
#        1, 1, 0])
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# Example confusion matrix (rows = true class, cols = predicted class):
# array([[29,  1,  0],
#        [ 3, 22,  0],
#        [ 2, 11,  1]])