Kaggle预测房价知识点 01数据预处理

Stacked Regressions : Top 4% on LeaderBoard

1. subprocess模块的check_output函数，用来得到命令行的输出结果

# Kaggle kernel snippet: print the files found in the ../input directory.
listing = check_output(["ls", "../input"])
print(listing.decode("utf8"))

# Example: act on the text a command writes to stdout.
# check_output returns bytes on Python 3, so decode before searching for a
# str; calling bytes.find("yes") with a str argument raises TypeError.
output = subprocess.check_output(["python3", "xx.py"], shell=False).decode("utf8")
if "yes" in output:
    print("yes")
else:
    print("no")

2. csv操作

# Load the training set from CSV.
train = pd.read_csv('../input/train.csv')
# Show the first five rows.
train.head(n=5)
# Drop the Id column in place.
train.drop(columns=["Id"], inplace=True)
# Remove specific rows: GrLivArea above 4000 combined with SalePrice below 300000.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(train[outlier_mask].index)
# Concatenate the frames and renumber the rows.
# NOTE(review): `test` is not defined in this snippet - presumably loaded elsewhere; confirm.
all_data = pd.concat([train, test]).reset_index(drop=True)

3. 可视化

# Scatter plot of GrLivArea (x) against SalePrice (y).
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
# Label the axes through the Axes object created above.
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()

可视化xy数据

# Visualize the distribution of SalePrice with seaborn, overlaying a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; it is kept because
# this call relies on fit=norm - confirm against the installed seaborn version.
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
# The newlines are written as "\n" escapes; a single-quoted literal cannot
# span physical lines, so raw line breaks here would be a syntax error.
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
# Raw string so that \mu and \sigma reach matplotlib's mathtext intact and
# render as Greek letters instead of the plain words "mu" / "sigma".
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

4. NULL值检查、处理

# Percentage of missing values per column.
na_counts = all_data.isnull().sum()
all_data_na = (na_counts / len(all_data)) * 100
# Discard columns with no missing values, then keep the 30 highest ratios.
complete_cols = all_data_na[all_data_na == 0].index
all_data_na = all_data_na.drop(complete_cols).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(n=20)

# Replace missing values in these columns with the string 'None'.
garage_cols = ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond')
for col in garage_cols:
    all_data[col] = all_data[col].fillna('None')
# Fill missing LotFrontage with the median LotFrontage of the same Neighborhood.
frontage_by_hood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = frontage_by_hood.transform(
    lambda grp: grp.fillna(grp.median()))

5. 数据关联性检查

# Correlation matrix of the training features, drawn as a heatmap.
# Restrict to numeric columns first: on pandas >= 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises on string columns.
corrmat = train.select_dtypes(include='number').corr()
plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.9; square=True draws square cells.
sns.heatmap(corrmat, vmax=0.9, square=True)

6. Label Encoding

from sklearn.preprocessing import LabelEncoder
# Columns to be label-encoded.
cols = (
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley',
    'CentralAir', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold',
)
# Fit a fresh LabelEncoder on each column and overwrite the column with its integer codes.
for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(list(all_data[c].values))

# Report the resulting frame shape.
print('Shape all_data: {}'.format(all_data.shape))

7. OneHot Encoding

# One-hot encode all_data with pandas.get_dummies, then print the new shape.
all_data = pd.get_dummies(data=all_data)
n_rows_cols = all_data.shape
print(n_rows_cols)