# Contents: Pima Indians diabetes dataset
# Machine learning: model evaluation, part 2
# Data source: Pima Indians Diabetes dataset
# Step 1: data preprocessing
import pandas as pd
# Load the diabetes CSV from a hard-coded local path.
path = '/Users/xuwen/Downloads/diabetes.csv'
pima = pd.read_csv(path)
# Preview the first rows. Printed explicitly: a bare `pima.head()` expression
# displays nothing when this file is executed as a script.
print(pima.head())
# Step 1: assign the feature matrix (capital X) and the target vector
# (lowercase y), keeping only the four feature columns we need.
feature_names = ['Pregnancies','Insulin','BMI','Age']
X = pima[feature_names]
y = pima.Outcome
# Step 2: confirm the dimensions of X and y.
print(X.shape)
print(y.shape)
# Step 3: split the data into training and test sets. No test_size is given,
# so the library default applies; random_state=0 makes the split repeatable.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Step 4: train a logistic regression model on the training split
# (fit() returns the fitted estimator, so construction and fit are chained).
logreg = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the test split.
y_pred = logreg.predict(X_test)

# Evaluate the predictions with classification accuracy.
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
# Count the positive and negative samples in the test labels.
# Values are printed explicitly: bare expressions are silently discarded when
# this file runs as a script.
print(y_test.value_counts())
# Share of 1s. The mean equals that share only if the labels are coded 0/1,
# which this calculation assumes.
positive_rate = y_test.mean()
# Share of 0s.
negative_rate = 1 - positive_rate
print("positive rate:", positive_rate)
print("negative rate:", negative_rate)
# Null accuracy: the accuracy obtained by always predicting the majority class.
null_accuracy = max(positive_rate, negative_rate)
print("null accuracy:", null_accuracy)
# Confusion (error) matrix, computed once and reused for every metric below.
# labels=[0, 1] pins the 2x2 layout [[TN, FP], [FN, TP]] that the unpacking
# relies on; this assumes the target is coded 0/1 — confirm against the data.
confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
print(confusion)
# Actual vs. predicted labels for the first 25 test samples.
print("true:",y_test.values[0:25])
print("pred:",y_pred[0:25])
# Unpack the four cells in row-major order: rows are actual classes,
# columns are predicted classes.
TN, FP, FN, TP = confusion.ravel()
print(TN,FP,FN,TP)
# Every rate below shares the same denominator: the number of test samples.
total = TP + TN + FP + FN
# Accuracy = (TP+TN)/(TP+TN+FP+FN); cross-checked against sklearn.
accuracy = (TP + TN) / total
print(accuracy)
print(metrics.accuracy_score(y_test,y_pred))
# Misclassification rate = (FP+FN)/(TP+TN+FP+FN); cross-checked as 1 - accuracy.
mis_rate = (FP + FN) / total
print(mis_rate)
print(1-metrics.accuracy_score(y_test,y_pred))
# Recall (sensitivity) = TP/(TP+FN): the share of actual positives found.
recall = TP / (TP + FN)
print(recall)


