import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from termcolor import colored
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
print(colored('\nAll libraries imported successfully.', 'green'))
pd.options.mode.copy_on_write = True  # enable pandas copy-on-write behavior
sns.set_style('darkgrid')  # seaborn plot style
warnings.filterwarnings('ignore')  # suppress warning messages
pd.set_option('display.max_columns', None)  # show every DataFrame column
pd.set_option('display.max_colwidth', None)  # show full column contents, without truncation
sns.set_palette("cool_r")  # default color palette for the plots
print(colored('\nAll libraries configured successfully.', 'green'))
# load the data set with pandas read_csv
data = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
data.head()
data.info()
data.describe().T.style.background_gradient(axis=0)
data.isna().sum()  # count missing values per column
# rename columns that contain spaces so they are easier to reference
data.rename(columns = {
"fixed acidity": "fixed_acidity",
"volatile acidity": "volatile_acidity",
"citric acid": "citric_acid",
"residual sugar": "residual_sugar",
"chlorides": "chlorides",
"free sulfur dioxide": "free_sulfur_dioxide",
"total sulfur dioxide": "total_sulfur_dioxide"},
inplace = True)
columns = list(data.columns)  # list of column names (11 features + quality)
fig, ax = plt.subplots(11, 2, figsize=(15, 45))
plt.subplots_adjust(hspace=0.5)
for i in range(11):
    sns.boxplot(x=columns[i], data=data, ax=ax[i, 0])  # left column: boxplot of the feature
    sns.scatterplot(x=columns[i], y='quality', data=data, hue='quality', ax=ax[i, 1])  # right column: feature vs quality scatterplot
plt.show()
# no missing values
# no clear outliers (values that differ drastically from the rest of the data); see the check below
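# A minimal sketch (not in the original notebook) to quantify the statement above:
# per feature, count rows outside the usual 1.5*IQR boxplot whiskers.
features = columns[:11]  # the 11 physico-chemical attributes, excluding 'quality'
q1, q3 = data[features].quantile(0.25), data[features].quantile(0.75)
iqr = q3 - q1
print(((data[features] < q1 - 1.5 * iqr) | (data[features] > q3 + 1.5 * iqr)).sum())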
corr = data.corr()
#cmap = sns.diverging_palette(-1, 1, s=100, l=50, n=15, center="dark", as_cmap=True)
plt.figure(figsize=(9, 6))
sns.heatmap(corr, annot=True, fmt='.2f', linewidths=0.5, cmap='Purples', mask=np.triu(np.ones_like(corr, dtype=bool)))  # correlation heatmap, redundant upper triangle masked
plt.show()
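# Optional follow-up (not in the original notebook): rank the features by the
# strength (absolute value) of their correlation with quality.
corr_quality = corr['quality'].drop('quality')
print(corr_quality.reindex(corr_quality.abs().sort_values(ascending=False).index).round(2))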
sns.pairplot(data, hue='quality', corner=True, palette='Purples')  # pairwise feature relationships, colored by quality
# categorization: turn the numeric quality scores into a three-level class attribute
data.quality.unique()  # quality scores present in the data
data = data.replace({
'quality': {
8: 'Good',
7: 'Good',
6: 'Middle',
5: 'Middle',
4: 'Bad',
3: 'Bad',
}
})
data.head()
sns.histplot(data=data, x="fixed_acidity")  # histogram of fixed_acidity
plt.show()  # before normalization
X_temp = data.drop(columns='quality')  # predictor attributes
y = data.quality  # class attribute
X_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_temp)  # rescale each predictor to the [0, 1] range
X = pd.DataFrame(X_scaled, columns=X_temp.columns)
X.describe().T.style.background_gradient(axis=0, cmap='Purples')
sns.histplot(data=X, x="fixed_acidity")
plt.show()  # after normalization
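# Quick sanity check (an addition, not part of the original notebook): after
# MinMaxScaler, every predictor column should span exactly the [0, 1] range.
print(X.min().min(), X.max().max())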
# function to plot a confusion matrix (test labels, predicted labels)
def plot_confusion_matrix(y_test, y_prediction):
    cm = metrics.confusion_matrix(y_test, y_prediction)
    ax = plt.subplot()
    ax = sns.heatmap(cm, annot=True, fmt='', cmap="Purples")
    ax.set_xlabel('Predicted labels', fontsize=18)
    ax.set_ylabel('True labels', fontsize=18)
    ax.set_title('Confusion Matrix', fontsize=25)
    ax.xaxis.set_ticklabels(['Bad', 'Good', 'Middle'])
    ax.yaxis.set_ticklabels(['Bad', 'Good', 'Middle'])
    plt.show()
# function to plot the classification report as a heatmap
def clfr_plot(y_test, y_pred):
    # note: kept for reference; clf_plot below combines this report with the confusion matrix
    cr = pd.DataFrame(metrics.classification_report(y_test, y_pred, digits=3, output_dict=True)).T
    cr.drop(columns='support', inplace=True)
    sns.heatmap(cr, cmap='Purples', annot=True, linecolor='white', linewidths=0.5).xaxis.tick_top()
def clf_plot(y_pred):
    # plot, side by side: 1. confusion matrix, 2. classification report
    # (uses the global y_test created by the train/test split below)
    cm = metrics.confusion_matrix(y_test, y_pred)
    cr = pd.DataFrame(metrics.classification_report(y_test, y_pred, digits=3, output_dict=True)).T
    cr.drop(columns='support', inplace=True)
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    # left axis: confusion matrix
    ax[0] = sns.heatmap(cm, annot=True, fmt='', cmap="Purples", ax=ax[0])
    ax[0].set_xlabel('Predicted labels', fontsize=18)
    ax[0].set_ylabel('True labels', fontsize=18)
    ax[0].set_title('Confusion Matrix', fontsize=25)
    ax[0].xaxis.set_ticklabels(['Bad', 'Good', 'Middle'])
    ax[0].yaxis.set_ticklabels(['Bad', 'Good', 'Middle'])
    # right axis: classification report
    ax[1] = sns.heatmap(cr, cmap='Purples', annot=True, linecolor='white', linewidths=0.5, ax=ax[1])
    ax[1].xaxis.tick_top()
    ax[1].set_title('Classification Report', fontsize=25)
    plt.show()
data.quality.value_counts()  # class distribution of the new labels
# train/test split: 75% of the rows for training and 25% for testing (an 80/20 split is also common); X holds the predictors, y the class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
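# Optional check (not in the original notebook): the split above is not stratified,
# so compare the class proportions that ended up in the train and test sets.
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))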
# grid of hyperparameter values to try for the algorithm
parameters = {
'n_estimators': [50, 150, 500],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_features': ['sqrt', 'log2']
}
rf = RandomForestClassifier(n_jobs=-1)
rf_cv = GridSearchCV(estimator=rf, cv=20, param_grid=parameters).fit(X_train, y_train)
print('Tuned hyperparameters: ', rf_cv.best_params_)
print('Best cross-validation accuracy: ', rf_cv.best_score_)
rf = RandomForestClassifier(**rf_cv.best_params_).fit(X_train, y_train)  # refit the model with the tuned parameters on the training data
y_pred_rf = rf.predict(X_test)
rf_score = round(rf.score(X_test, y_test), 3)  # accuracy on the test set
print('RandomForestClassifier score: ', rf_score)
y_test.value_counts()
clf_plot(y_pred_rf)
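# Optional (not in the original notebook): inspect which predictors the tuned
# random forest relies on most, via its feature_importances_ attribute.
print(pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False).round(3))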
parameters = {
'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
'class_weight': ['balanced'],
'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}
lr = LogisticRegression()
lr_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=10).fit(X_train, y_train)
print('Tuned hyperparameters: ', lr_cv.best_params_)
print('Best cross-validation accuracy: ', lr_cv.best_score_)
lr = LogisticRegression(**lr_cv.best_params_).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_score = round(lr.score(X_test, y_test), 3)
print('LogisticRegression score: ', lr_score)
clf_plot(y_pred_lr)
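# Optional (not in the original notebook): the fitted logistic regression exposes one
# coefficient per class and feature; larger magnitudes indicate stronger influence.
print(pd.DataFrame(lr.coef_, columns=X.columns, index=lr.classes_).round(2))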
parameters = {
'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
'gamma': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
}
svc = SVC()
svc_cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=10).fit(X_train, y_train)
print('Tuned hyperparameters: ', svc_cv.best_params_)
print('Best cross-validation accuracy: ', svc_cv.best_score_)
svc = SVC(**svc_cv.best_params_).fit(X_train, y_train)  # refit the model with the tuned parameters on the training data
y_pred_svc = svc.predict(X_test)
svc_score = round(svc.score(X_test, y_test), 3)
print('SVC Score: ', svc_score)
clf_plot(y_pred_svc)
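# Optional (not in the original notebook): number of support vectors the tuned SVC keeps
# per class, a rough indicator of how hard each class is to separate.
print(dict(zip(svc.classes_, svc.n_support_)))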
parameters = {
'criterion': ['gini', 'entropy', 'log_loss'],
'splitter': ['best', 'random'],
'max_depth': list(np.arange(4, 30, 1))
}
tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(estimator=tree, cv=10, param_grid=parameters).fit(X_train, y_train)
print('Tuned hyperparameters: ', tree_cv.best_params_)
print('Best cross-validation accuracy: ', tree_cv.best_score_)
tree = DecisionTreeClassifier(**tree_cv.best_params_).fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
tree_score = round(tree.score(X_test, y_test), 3)
print('DecisionTreeClassifier Score: ', tree_score)
clf_plot(y_pred_tree)
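# Optional (not in the original notebook): size of the tuned decision tree.
print('depth:', tree.get_depth(), '| leaves:', tree.get_n_leaves())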
parameters = {
'n_neighbors': list(np.arange(3, 50, 2)),
'weights': ['uniform', 'distance'],
'p': [1, 2, 3, 4]
}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, cv=10, param_grid=parameters).fit(X_train, y_train)
print('Tuned hyperparameters: ', knn_cv.best_params_)
print('Best cross-validation accuracy: ', knn_cv.best_score_)
knn = KNeighborsClassifier(**knn_cv.best_params_).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)  # predict with the refit model, consistent with the other classifiers
knn_score = round(knn.score(X_test, y_test), 3)
print('KNeighborsClassifier Score:', knn_score)
clf_plot(y_pred_knn)
gnb = GaussianNB().fit(X_train, y_train)  # GaussianNB is used with its defaults (no grid search here)
y_pred_gnb = gnb.predict(X_test)
gnb_score = round(gnb.score(X_test, y_test), 3)
print('GNB Score:', gnb_score)
clf_plot(y_pred_gnb)
result = pd.DataFrame({
'Algorithm': ['RandomForestClassifier', 'LogisticRegression', 'SVC', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'GaussianNB'],
'Score': [rf_score, lr_score, svc_score, tree_score, knn_score, gnb_score]
})
result.sort_values(by='Score', inplace=True)
sns.set_palette("Purples")
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
sns.barplot(x='Algorithm', y='Score', data=result)
ax.bar_label(ax.containers[0], fmt='%.3f')
ax.set_xticklabels(labels=result.Algorithm, rotation=300)
plt.show()  # SVC achieves the highest test accuracy (score)
svc = SVC(**svc_cv.best_params_)
svc.fit(X, y)  # refit the best model on the full data set with the tuned parameters (predictors vs class)
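# Optional (not in the original notebook): cross_val_score and KFold are imported above but
# never used; a 10-fold cross-validation estimate of the final SVC is a fairer figure than
# scoring the model on rows it was trained on.
cv_scores = cross_val_score(SVC(**svc_cv.best_params_), X, y, cv=KFold(n_splits=10, shuffle=True, random_state=0))
print('SVC 10-fold CV accuracy:', round(cv_scores.mean(), 3))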
# Prepared by Mateus Schwede
# ubsocial.github.io