芯が強い人になるESTJ-A

# ML数据清洗

IT開発 Tags: 无标签 阅读: 153

# Dimension mismatches are an easy source of errors, so inspect both inputs.
import numpy as np

for arr in (x, y):
    print(type(arr), arr.shape)

# Mismatched data shapes lead to an error: rebuild x as a 2-D array with a
# single column (reshape(-1, 1) lets NumPy infer the row count).
x = np.array(x).reshape(-1, 1)
print(type(x), x.shape)

截屏2024-08-23 17.07.54.png
截屏2024-08-24 14.33.19.png

截屏2024-08-24 15.38.48.png

# Load the exam data into a DataFrame.
# numpy keeps its conventional `np` alias so it is not shadowed by pandas' `pd`.
import numpy as np
import pandas as pd

# NOTE(review): absolute path on the author's machine, kept exactly as written
# (including the doubled leading slash) — adjust for your environment.
data = pd.read_csv("//Users/xuwen/downloads/imooc/Chapter3/examdata.csv")
# Preview the first rows; the result is discarded when run as a plain script
# (presumably this was a notebook cell — confirm).
data.head()

# Confirm the loaded object's type and its shape.
print(type(data), data.shape)

# Visualize the data: scatter plot of the 'Exam1' column against 'Exam2'.
import matplotlib.pyplot as plt

fig1 = plt.figure()
exam1_scores = data['Exam1']
exam2_scores = data['Exam2']
plt.scatter(exam1_scores, exam2_scores)
plt.title('Exam1--Exam2')
plt.xlabel('Exam1')
plt.ylabel('Exam2')
plt.show()


# Build a boolean label mask: True on rows whose 'Pass' value equals 1.
mask = data['Pass'] == 1
print(mask)

# Element-wise negation of the mask (True where 'Pass' is not 1).
print(~mask)

# Plot the two groups as separate scatter series so each gets a legend entry.
fig2 = plt.figure()
passed = plt.scatter(data.loc[mask, 'Exam1'], data.loc[mask, 'Exam2'])
failed = plt.scatter(data.loc[~mask, 'Exam1'], data.loc[~mask, 'Exam2'])
plt.title('Exam1 -Exam2')
plt.xlabel('Exam1')
plt.ylabel('Exam2')
plt.legend([passed, failed], ['passed', 'failed'])
plt.show()