IA qualidade do vinho

Algoritmo de ML para classificação da qualidade do vinho
Voltar

Anexos:


Importar bibliotecas:


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from termcolor import colored

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Confirmation banner printed once the imports above have run.
print(colored('\nAll libraries imported succesfully.', 'green'))

# Global pandas / seaborn / warnings configuration used by the rest of the file.
pd.options.mode.copy_on_write = True # turn on pandas' copy_on_write option
sns.set_style('darkgrid') # seaborn visual style for every plot
warnings.filterwarnings('ignore') # silence all warnings
pd.set_option('display.max_columns', None) # None lifts the limit: every DataFrame column is displayed
pd.set_option('display.max_colwidth', None) # None lifts the limit: cell contents are not truncated

# Builds a one-colour "cool_r" palette; the return value is not stored
# (presumably kept so a notebook cell displays the swatch).
sns.color_palette("cool_r", n_colors=1)

# Make "cool_r" the default palette for the plots that follow.
sns.set_palette("cool_r")
print(colored('\nAll libraries Configed succesfully.', 'green'))

Coletar dados:


# Load the dataset with pandas' read_csv (the path is the Kaggle input location).
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
# The expressions below only inspect the data; their results are not stored.
data.head() # first rows
data.info() # column dtypes and non-null counts
data.describe().T.style.background_gradient(axis=0) # summary statistics, one row per column, colour-graded
data.isna().sum() # number of missing values per column

Visualização em gráficos:


# Rename the listed columns to snake_case (spaces become underscores) so they
# are easier to reference later on.
renamed_columns = {
    original: original.replace(" ", "_")
    for original in (
        "fixed acidity",
        "volatile acidity",
        "citric acid",
        "residual sugar",
        "chlorides",
        "free sulfur dioxide",
        "total sulfur dioxide",
    )
}
data.rename(columns=renamed_columns, inplace=True)

columns = list(data.columns)  # every column name, in DataFrame order

# Plot each predictor, i.e. every column except the 'quality' target.
# Deriving the list from the columns (instead of assuming "the first 11")
# keeps the grid correct if columns are added, removed or reordered.
features = [name for name in columns if name != 'quality']

# One row per predictor: boxplot on the left, scatter against quality on the right.
fig, ax = plt.subplots(len(features), 2, figsize=(15, 45), squeeze=False)
plt.subplots_adjust(hspace=0.5)
for (box_ax, scatter_ax), feature in zip(ax, features):
    sns.boxplot(x=feature, data=data, ax=box_ax)
    sns.scatterplot(x=feature, y='quality', data=data, hue='quality', ax=scatter_ax)

# Author's observations:
#   - there are no missing values
#   - there are no outliers (points that differ drastically from all the others)
# NOTE(review): the outlier claim should be confirmed against the boxplots above.

# Pairwise correlation between all columns.
corr = data.corr()

# Explicit boolean mask for the upper triangle (diagonal included).
# Passing np.triu(corr) itself masks by truthiness of the correlation values,
# so an upper-triangle correlation of exactly 0.0 would stay visible.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(9, 6))
sns.heatmap(corr, annot=True, fmt='.2f', linewidth=0.5, cmap='Purples', mask=upper_triangle)  # lower-triangle heatmap
plt.show()

# Pairwise relationship grid for all columns, coloured by 'quality';
# corner=True is passed so only one triangle of the grid is requested.
sns.pairplot(data, hue='quality', corner = True, palette='Purples') # pairplot chart

Análises Plot:

Melhores correlações entre:
  • citric_acid e fixed_acidity: 0.67
  • density e fixed_acidity: 0.67
  • total_sulfur_dioxide e free_sulfur_dioxide: 0.67

# Categorisation: build the class attribute by replacing the numeric
# 'quality' scores with three named bands.
data.quality.unique()  # distinct scores before the replacement (inspection only)
quality_bands = (
    ('Good', (8, 7)),
    ('Middle', (6, 5)),
    ('Bad', (4, 3)),
)
quality_labels = {
    score: band
    for band, scores in quality_bands
    for score in scores
}
data = data.replace({'quality': quality_labels})
data.head()

Normalização:


# Distribution of one predictor before scaling, for comparison with the plot at the end.
sns.histplot(data=data, x="fixed_acidity") # histogram
plt.show() # without normalisation

X_temp = data.drop(columns='quality') # predictor columns (everything except the class)
y = data.quality # class column

# Min-max scale the predictors so every column lies between 0 and 1.
# Despite its name, `scaler` holds the transformed values returned by
# fit_transform, not the MinMaxScaler object itself.
# NOTE(review): the scaler is fitted on all rows before the later
# train_test_split, so test rows influence the min/max used for scaling
# (data leakage) - consider fitting it on the training split only.
scaler = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_temp) # normalise the predictors to the 0-1 range
X = pd.DataFrame(scaler, columns=X_temp.columns)
X.describe().T.style.background_gradient(axis=0, cmap='Purples')

# Same histogram on the scaled predictors.
sns.histplot(data=X, x="fixed_acidity")
plt.show() # with normalisation

Modelagem:


def plot_confusion_matrix(y_test, y_prediction):
    """Draw the confusion matrix of a set of predictions as a heatmap.

    Args:
        y_test: True class labels.
        y_prediction: Predicted class labels, aligned with ``y_test``.

    The tick labels are derived from the labels actually present in the two
    inputs and the same list is passed to ``confusion_matrix``, so rows and
    columns are always named correctly, even when a class is missing from
    the evaluated sample.
    """
    # Sorted union of observed labels; handed to confusion_matrix explicitly
    # so the matrix order and the tick labels cannot drift apart.
    class_names = sorted(set(y_test) | set(y_prediction))
    cm = metrics.confusion_matrix(y_test, y_prediction, labels=class_names)

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='', cmap="Purples", ax=ax)
    ax.set_xlabel('Predicted labels', fontsize=18)
    ax.set_ylabel('True labels', fontsize=18)
    ax.set_title('Confusion Matrix', fontsize=25)
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)
    plt.show()

def clfr_plot(y_test, y_pred):
    """Draw the classification report of a set of predictions as a heatmap.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.

    The 'support' column is dropped before plotting, leaving only the
    metric columns of the report.
    """
    # Use the y_pred argument; the previous version read the global
    # y_pred_rf, so every call showed the Random Forest report.
    report = metrics.classification_report(y_test, y_pred, digits=3, output_dict=True)
    cr = pd.DataFrame(report).T
    cr.drop(columns='support', inplace=True)
    sns.heatmap(cr, cmap='Purples', annot=True, linecolor='white', linewidths=0.5).xaxis.tick_top()

def clf_plot(y_pred, y_true=None):
    """Show a confusion matrix and a classification report side by side.

    Args:
        y_pred: Predicted class labels.
        y_true: True class labels aligned with ``y_pred``. When omitted, the
            module-level ``y_test`` is used, which is what this helper
            always did before the parameter existed.

    The tick labels of the confusion matrix are derived from the labels
    present in the inputs and the same list is passed to
    ``confusion_matrix``, so they stay correct even if a class is missing.
    """
    if y_true is None:
        y_true = y_test  # fall back to the global hold-out labels

    # 1. confusion matrix, with an explicit label order shared with the ticks
    class_names = sorted(set(y_true) | set(y_pred))
    cm = metrics.confusion_matrix(y_true, y_pred, labels=class_names)

    # 2. classification report as a DataFrame, without the 'support' column
    cr = pd.DataFrame(metrics.classification_report(y_true, y_pred, digits=3, output_dict=True)).T
    cr.drop(columns='support', inplace=True)

    fig, (cm_ax, cr_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # left axes: confusion matrix
    sns.heatmap(cm, annot=True, fmt='', cmap="Purples", ax=cm_ax)
    cm_ax.set_xlabel('Predicted labels', fontsize=18)
    cm_ax.set_ylabel('True labels', fontsize=18)
    cm_ax.set_title('Confusion Matrix', fontsize=25)
    cm_ax.xaxis.set_ticklabels(class_names)
    cm_ax.yaxis.set_ticklabels(class_names)

    # right axes: classification report
    sns.heatmap(cr, cmap='Purples', annot=True, linecolor='white', linewidths=0.5, ax=cr_ax)
    cr_ax.xaxis.tick_top()
    cr_ax.set_title('Classification Report', fontsize=25)
    plt.show()

# Number of rows per quality class (inspection only).
data.quality.value_counts()

# Split the data: 25% of the rows are held out for testing (the author notes
# that 80% train / 20% test is the more usual choice). X holds the predictors,
# y the class. random_state=0 is passed to fix the shuffle; no `stratify`
# argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

Classificação via Random Forest:


# Hyper-parameter grid tried by the search below.
parameters = {
    'n_estimators': [50, 150, 500],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2'],
}

# Grid search with cv=20 over the grid, fitted on the training split.
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1),
    param_grid=parameters,
    cv=20,
).fit(X_train, y_train)

print('Tuned hyper parameters: ', rf_cv.best_params_)
print('accuracy: ', rf_cv.best_score_)

# Train a new forest with the best parameters found (predictors -> class).
rf = RandomForestClassifier(**rf_cv.best_params_).fit(X_train, y_train)

# Evaluate on the held-out split: accuracy of the predictions, rounded to 3 places.
y_pred_rf = rf.predict(X_test)
rf_score = round(metrics.accuracy_score(y_test, y_pred_rf), 3)
print('RandomForestClassifier score: ', rf_score)

y_test.value_counts()
clf_plot(y_pred_rf)

Classificação via Logistic Regression:


# Hyper-parameter grid tried by the search below.
parameters = {
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
    'class_weight': ['balanced'],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
}

# Grid search with cv=10 over the grid, fitted on the training split.
lr_cv = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parameters,
    cv=10,
).fit(X_train, y_train)

print('Tuned hyper parameters: ', lr_cv.best_params_)
print('accuracy: ', lr_cv.best_score_)

# Train a new model with the best parameters found.
lr = LogisticRegression(**lr_cv.best_params_).fit(X_train, y_train)

# Evaluate on the held-out split: accuracy of the predictions, rounded to 3 places.
y_pred_lr = lr.predict(X_test)
lr_score = round(metrics.accuracy_score(y_test, y_pred_lr), 3)
print('LogisticRegression score: ', lr_score)
clf_plot(y_pred_lr)

Classificação via SVC (Support Vector Classifier):


# Hyper-parameter grid tried by the search below (same candidate values
# for C and gamma).
regularisation_steps = [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]
parameters = {
    'C': list(regularisation_steps),
    'gamma': list(regularisation_steps),
}

# Grid search with cv=10 over the grid, fitted on the training split.
svc_cv = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    cv=10,
).fit(X_train, y_train)

print('Tuned hyper parameters: ', svc_cv.best_params_)
print('accuracy: ', svc_cv.best_score_)

# Re-apply the model with the best parameters found (predictors -> class).
svc = SVC(**svc_cv.best_params_).fit(X_train, y_train)

# Evaluate on the held-out split: accuracy of the predictions, rounded to 3 places.
y_pred_svc = svc.predict(X_test)
svc_score = round(metrics.accuracy_score(y_test, y_pred_svc), 3)
print('SVC Score: ', svc_score)
clf_plot(y_pred_svc)

Classificação via Decision Tree:


# Hyper-parameter grid tried by the search below; max_depth covers 4..29.
parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': list(np.arange(4, 30)),
}

# Grid search with cv=10 over the grid, fitted on the training split.
tree_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    cv=10,
).fit(X_train, y_train)

print('Tuned hyper parameters: ', tree_cv.best_params_)
print('accuracy: ', tree_cv.best_score_)

# Train a new tree with the best parameters found.
tree = DecisionTreeClassifier(**tree_cv.best_params_).fit(X_train, y_train)

# Evaluate on the held-out split: accuracy of the predictions, rounded to 3 places.
y_pred_tree = tree.predict(X_test)
tree_score = round(metrics.accuracy_score(y_test, y_pred_tree), 3)
print('DecisionTreeClassifier Score: ', tree_score)
clf_plot(y_pred_tree)

Classificação via KNN (k-nearest neighbors):


# Hyper-parameter grid tried by the search below: odd neighbour counts
# from 3 to 49, both weighting schemes, and Minkowski powers 1-4.
parameters = {
    'n_neighbors': list(np.arange(3, 50, 2)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3, 4]
}

# Grid search with cv=10 over the grid, fitted on the training split.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, cv=10, param_grid=parameters).fit(X_train, y_train)

print('Tuned hyper parameters: ', knn_cv.best_params_)
print('accuracy: ', knn_cv.best_score_)

# Train a new classifier with the best parameters found.
knn = KNeighborsClassifier(**knn_cv.best_params_).fit(X_train, y_train)

# Predict with the re-trained `knn`, the same object that is scored on the
# next line. The previous version predicted with the search object `knn_cv`
# instead, so the plotted predictions and the reported score came from two
# different objects, unlike every other model section in this file.
y_pred_knn = knn.predict(X_test)
knn_score = round(knn.score(X_test, y_test), 3)
print('KNeighborsClassifier Score:', knn_score)
clf_plot(y_pred_knn)

Classificação via Gaussian Naïve Bayes:


# Gaussian Naive Bayes is fitted directly with its default settings; no
# hyper-parameter search is run for it.
gnb = GaussianNB().fit(X_train, y_train)

# Evaluate on the held-out split: accuracy of the predictions, rounded to 3 places.
y_pred_gnb = gnb.predict(X_test)
gnb_score = round(metrics.accuracy_score(y_test, y_pred_gnb), 3)
print('GNB Score:', gnb_score)

clf_plot(y_pred_gnb)

Conclusões (resultado final):


# Gather the test-set score of every model, keyed by algorithm name.
algorithm_scores = {
    'RandomForestClassifier': rf_score,
    'LogisticRegression': lr_score,
    'SVC': svc_score,
    'DecisionTreeClassifier': tree_score,
    'KNeighborsClassifier': knn_score,
    'GaussianNB': gnb_score,
}
result = pd.DataFrame({
    'Algorithm': list(algorithm_scores),
    'Score': list(algorithm_scores.values()),
})
result = result.sort_values(by='Score')  # ascending, so the best model is last
sns.set_palette("Purples")

# Bar chart of the scores, each bar annotated with its value.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='Algorithm', y='Score', data=result, ax=ax)
ax.bar_label(ax.containers[0], fmt='%.3f')
ax.set_xticklabels(labels=result.Algorithm, rotation=300)
plt.show()  # author's note: SVC has the highest accuracy (score)

Modelagem final:


# Final model: an SVC built from the best parameters found by the grid search
# and fitted on the complete X / y (not only the training split).
svc = SVC(**svc_cv.best_params_)
svc.fit(X, y) # re-apply the model with the best parameters (predictors -> class)

Elaborado por Mateus Schwede
ubsocial.github.io