# citation("palmerpenguins")   # R command that produced the citation below
#>
#> To cite palmerpenguins in publications use:
#>
#> Horst AM, Hill AP, Gorman KB (2020). palmerpenguins: Palmer
#> Archipelago (Antarctica) penguin data. R package version 0.1.0.
#> https://allisonhorst.github.io/palmerpenguins/. doi:
#> 10.5281/zenodo.3960218.
#>
#> A BibTeX entry for LaTeX users is
#>
#> @Manual{,
#> title = {palmerpenguins: Palmer Archipelago (Antarctica) penguin data},
#> author = {Allison Marie Horst and Alison Presmanes Hill and Kristen B Gorman},
#> year = {2020},
#> note = {R package version 0.1.0},
#> doi = {10.5281/zenodo.3960218},
#> url = {https://allisonhorst.github.io/palmerpenguins/},
#> }
# Data: https://github.com/allisonhorst/palmerpenguins
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn import metrics
# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Preprocesado y modelado
# ==============================================================================
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Configuración matplotlib
# ==============================================================================
# Default to a diverging colormap and crop whitespace when saving figures.
plt.rcParams['image.cmap'] = "bwr"
plt.rcParams['savefig.bbox'] = "tight"
# BUG FIX: `style.use(...)` returns None, so the original
# `style.use('ggplot') or plt.style.use('ggplot')` always executed BOTH calls.
# `matplotlib.style` and `plt.style` are the same module; one call suffices.
plt.style.use('ggplot')
# Warnings configuration
# ==============================================================================
import warnings
# NOTE(review): a blanket ignore hides deprecation notices too; consider
# narrowing the filter (e.g. by category) once the notebook is stable.
warnings.filterwarnings('ignore')
# saving the model
import pickle
# Load the Palmer penguins data and keep only the two culmen (bill)
# measurements as predictors.
data = pd.read_csv('penguins.csv')
data.rename({'Culmen Length (mm)': 'Culmen_Length',
             'Culmen Depth (mm)': 'Culmen_Depth'}, axis=1, inplace=True)
# Drop every column except Culmen_Length and Culmen_Depth.
X = data.drop(['studyName', 'Sample Number', 'Region', 'Island', 'Stage',
               'Species', 'Individual ID', 'Clutch Completion', 'Date Egg',
               'Flipper Length (mm)', 'Body Mass (g)', 'Sex',
               'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Comments'], axis=1)
# X.columns -> Index(['Culmen_Length', 'Culmen_Depth'], dtype='object')
# Encode the three species as integers 0/1/2 (sklearn's LabelEncoder would
# work as well); unique labels observed in the raw data:
#   'Adelie Penguin (Pygoscelis adeliae)', 'Gentoo penguin (Pygoscelis papua)',
#   'Chinstrap penguin (Pygoscelis antarctica)'
data['Species'] = data['Species'].map({
    'Adelie Penguin (Pygoscelis adeliae)': 0,
    'Gentoo penguin (Pygoscelis papua)': 1,
    'Chinstrap penguin (Pygoscelis antarctica)': 2,
})
y = data['Species']
# Stratified 80/20 split so each species keeps its class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.8, random_state=1234)
def model(X, y):
    """Grid-search a scale -> select-k-best -> logistic-regression pipeline.

    Parameters
    ----------
    X : feature matrix (DataFrame or 2-D array).
    y : target labels aligned with the rows of X.

    Returns
    -------
    The fitted GridSearchCV object (exposes best_params_, best_score_,
    predict, ...).
    """
    # Work on copies so the caller's data is never mutated.
    X = X.copy()
    y = y.copy()
    pipeline = Pipeline(steps=[
        ('scaler', MinMaxScaler()),
        ('feature_selection', SelectKBest(score_func=mutual_info_classif)),
        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
    ])
    # BUG FIX: range(1, X.shape[1]) excluded k == n_features, so the grid
    # could never keep ALL features (with 2 columns it only tried k=1).
    # The inclusive upper bound matches the hand-built pipeline that uses k=2.
    param_grid = {
        'feature_selection__k': range(1, X.shape[1] + 1),
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               scoring='accuracy',
                               n_jobs=-1,
                               cv=3)
    grid_search.fit(X, y)
    return grid_search
# Fit a reference pipeline with fixed hyperparameters (k=2, C=1000) purely to
# read out the mutual-information score of each candidate feature.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=2)),
    ('classifier', LogisticRegression(random_state=11, max_iter=1000, C=1000)),
])
pipeline.fit(X_train, y_train)
# Map each feature name to its mutual-information score, rounded to 2 d.p.
feature_scores = {
    name: score
    for name, score in zip(X_train.columns,
                           np.round(pipeline['feature_selection'].scores_, 2))
}
# Fit the final model via GridSearchCV and persist it with pickle.
penguins_model = model(X_train, y_train)
# `with` guarantees the file handle is closed even if dump() raises.
with open("classifier_penguins.pkl", mode="wb") as pickle_out:
    pickle.dump(penguins_model, pickle_out)
print(f'Best params: {penguins_model.best_params_}\n'
      f'Best score: {penguins_model.best_score_}\n'
      f'Features scores: {feature_scores}')
#> Best params: {'classifier__C': 10, 'feature_selection__k': 1}
#> Best score: 0.7527074374900463
#> Features scores: {'Culmen_Length': 0.53, 'Culmen_Depth': 0.54}
# Predict on the held-out 20% and inspect the confusion matrix.
y_pred = penguins_model.predict(X_test)
# Example y_pred from one run:
# array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
#        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
#        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
#        1, 1, 0])
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# Example confusion matrix (rows = true class, cols = predicted class):
# array([[29,  1,  0],
#        [ 3, 22,  0],
#        [ 2, 11,  1]])