https://github.com/HousePricesPredictionTeam/BIT_HousePricesPredict
Kaggle House Price Prediction: House Prices - Advanced Regression Techniques
Housing prices are closely tied to everyday life, and their fluctuations weigh on countless prospective buyers. Being able to forecast price movements can help buyers make sound decisions. In this project we use the Ames, Iowa housing dataset from the Kaggle competition; its 79 explanatory variables describe almost every aspect of residential homes in Ames, Iowa. We build models on this dataset and predict sale prices.
Many factors influence housing prices. We visualize the data to analyze how each factor affects price, apply feature engineering to select the most relevant factors, build regression models with several machine-learning algorithms (e.g. decision-tree regression, XGBoost), and compare their predictions.
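As a preview of the modeling workflow, the sketch below shows the comparison loop in miniature, assuming a feature matrix X and price labels y have already been prepared (the model choices mirror the sections that follow; the hyperparameters here are illustrative, not tuned):

# Minimal model-comparison sketch; X and y are assumed to be the prepared
# feature matrix and SalePrice labels (built in the sections below).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

def compare_models(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)
    models = {
        'RandomForest': RandomForestRegressor(),
        'BayesianRidge': BayesianRidge(),
        'XGBoost': XGBRegressor(n_estimators=500, learning_rate=0.1),
    }
    for name, m in models.items():
        m.fit(X_tr, np.ravel(y_tr))
        rmse = np.sqrt(mean_squared_error(y_te, m.predict(X_te)))
        print(f'{name}: RMSE = {rmse:.0f}')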
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
data
data.shape
for i in data.columns[1:]:
    if data[i].isnull().sum() > 0:
        print(i, data[i].isnull().sum())
# Keep only columns that are at least 30% non-null
Deldata = data[[column for column in data if data[column].count() / len(data) >= 0.3]]
del Deldata['Id']
print("Dropped columns:", end=" ")
for c in data.columns:
    if c not in Deldata.columns:
        print(c, end=", ")
print('\n')
data = Deldata
data.shape
# Fraction of missing values in each remaining column
isnull = data.isnull().sum(axis=0)
isnull[isnull > 0] / data.shape[0]
# Forward-fill the remaining missing values
data.ffill(inplace=True)
data
numCol='LotFrontage/LotArea/MasVnrArea/BsmtFinSF1/BsmtFinSF2/BsmtUnfSF/TotalBsmtSF/1stFlrSF/2ndFlrSF/LowQualFinSF/GrLivArea/BsmtFullBath/BsmtHalfBath/FullBath/HalfBath/BedroomAbvGr/KitchenAbvGr/TotRmsAbvGrd/Fireplaces/GarageCars/GarageArea/WoodDeckSF/OpenPorchSF/EnclosedPorch/3SsnPorch/ScreenPorch/PoolArea/MiscVal/MoSold/YrSold/YearBuilt/YearRemodAdd/GarageYrBlt/SalePrice'
num=numCol.split('/')
numFeature=data[num]
numFeature.shape
data[numFeature.columns].describe().loc['min':'max'].T
fig, axes = plt.subplots(7, 5, figsize=(10, 30))
for i, col in enumerate(numFeature.columns):
    axes[i // 5, i % 5].boxplot(data[col].values)
    axes[i // 5, i % 5].set_title(col)
fig.tight_layout()
numFeature.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
Nominal='MSSubClass/MSZoning/Street/LotShape/LandContour/Utilities/LotConfig/LandSlope/Neighborhood/Condition1/Condition2/BldgType/HouseStyle/OverallQual/OverallCond/RoofStyle/RoofMatl/Exterior1st/Exterior2nd/MasVnrType/ExterQual/ExterCond/Foundation/BsmtQual/BsmtCond/BsmtExposure/BsmtFinType1/BsmtFinType2/Heating/HeatingQC/CentralAir/Electrical/KitchenQual/Functional/FireplaceQu/GarageType/GarageFinish/GarageQual/GarageCond/PavedDrive/SaleType/SaleCondition'
NomFeature=data[Nominal.split('/')]
NomFeature.shape
fig, axes = plt.subplots(14, 3, figsize=(15, 20))
for i, col in enumerate(NomFeature.columns):
    c = data[col].value_counts()[:10]
    axes[i // 3, i % 3].bar(c.index, c.values)
    axes[i // 3, i % 3].set_title(col)
fig.tight_layout()
correlation1 = numFeature.corr()
f, ax = plt.subplots(figsize=(14, 12))
plt.title('Correlation of Numeric Features with Sale Price', y=1, size=16)
sns.heatmap(correlation1, square=True, vmax=0.8)
k = 10
cols1 = correlation1.nlargest(k, 'SalePrice')['SalePrice'].index
print(cols1)
cm = np.corrcoef(data[cols1].values.T)
f, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(cm, vmax=.8, linewidths=0.01, square=True, annot=True, cmap='viridis',
            linecolor="white", xticklabels=cols1.values, annot_kws={'size': 12}, yticklabels=cols1.values)
numeric_feature = data[['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']]
numeric_feature.to_csv('numeric_feature.csv', index=False)
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
data2=data.copy()
for i in NomFeature.columns:
    if data2[i].dtype == 'O':
        data2[i] = number.fit_transform(data[i].astype('str'))
data2
Corr='MSSubClass/MSZoning/Street/LotShape/LandContour/Utilities/LotConfig/LandSlope/Neighborhood/Condition1/Condition2/BldgType/HouseStyle/OverallQual/OverallCond/RoofStyle/RoofMatl/Exterior1st/Exterior2nd/MasVnrType/ExterQual/ExterCond/Foundation/BsmtQual/BsmtCond/BsmtExposure/BsmtFinType1/BsmtFinType2/Heating/HeatingQC/CentralAir/Electrical/KitchenQual/Functional/FireplaceQu/GarageType/GarageFinish/GarageQual/GarageCond/PavedDrive/SaleType/SaleCondition/SalePrice'
NomFeatures=data2[Corr.split('/')]
correlation2 = NomFeatures.corr()
f, ax = plt.subplots(figsize=(14, 12))
plt.title('Correlation of Nominal Features with Sale Price', y=1, size=16)
sns.heatmap(correlation2, square=True, vmax=0.8)
k = 10
cols = correlation2.nlargest(k, 'SalePrice')['SalePrice'].index
print(cols)
cm = np.corrcoef(data2[cols].values.T)
f, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(cm, vmax=.8, linewidths=0.01, square=True, annot=True, cmap='viridis',
            linecolor="white", xticklabels=cols.values, annot_kws={'size': 12}, yticklabels=cols.values)
catData = pd.get_dummies(NomFeature.astype(str))
catData.shape
catData
catData.to_csv('normal_feature_corr.csv', index=False)
The correlation between the numeric attributes and SalePrice can be analyzed with a correlation measure for continuous variables: the correlation coefficient.
For the nominal attributes, we first discretize SalePrice and then use a correlation measure for discrete variables: the information gain ratio.
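For reference, the gain-ratio definition the helper functions below follow (the standard C4.5 form, with $D$ the discretized SalePrice and $A$ a nominal attribute) is:

$$H(D) = -\sum_k p_k \log_2 p_k, \qquad \mathrm{Gain}(D, A) = H(D) - \sum_v \frac{|D_v|}{|D|} H(D_v), \qquad \mathrm{GainRatio}(D, A) = \frac{\mathrm{Gain}(D, A)}{H(A)}$$

where $D_v$ is the subset of samples taking value $v$ on attribute $A$, and $H(A)$ (the split information) is the entropy of $A$'s own value distribution.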
data.SalePrice.hist()
# Discretize the continuous SalePrice into five bins
def price_map(x):
    if x <= 108000:
        return 0
    elif x <= 170000:
        return 1
    elif x <= 250000:
        return 2
    elif x <= 310000:
        return 3
    else:
        return 4
sale_price = data.SalePrice.map(price_map)
data['sale_price'] = sale_price
sale_price.hist()
def Entropy(x):
    # Shannon entropy (base 2) of a vector of counts
    x = np.array(x)
    x = x / sum(x)
    return -(x * np.log2(x)).sum()

def gain_rate(data, colA, colB):
    # Information gain ratio of attribute colA with respect to target colB:
    # Gain = H(B) - H(B|A), GainRatio = Gain / H(A)
    weight = data.groupby([colA]).size() / data.shape[0]
    entropy_A = Entropy(data[colA].value_counts())  # split information H(A)
    entropy_B = Entropy(data[colB].value_counts())  # target entropy H(B)
    cond_entropy = (weight * data.groupby([colA]).apply(lambda x: Entropy(x[colB].value_counts()))).sum()
    return (entropy_B - cond_entropy) / entropy_A
# Only evaluate attributes with fewer than 20 distinct values
value_counts = NomFeature.apply(lambda x: x.value_counts().size, axis=0)
idx = value_counts[value_counts < 20].index
gains = []
for col in idx:
    gains.append([col, gain_rate(data, col, 'sale_price')])
gains = pd.DataFrame(gains, columns=['attribute', 'gainRate'])
# Keep the ten attributes with the highest gain ratio
attributes = gains.sort_values(by=['gainRate'], ascending=False).values[:10, 0]
gains.sort_values(by=['gainRate'], ascending=False)[:20]
nomfeature = pd.get_dummies(data[attributes].astype(str))
nomfeature.shape
nomfeature
nomfeature.to_csv('normal_feature_gainRate.csv', index=False)
data.SalePrice.to_csv('SalePrice.csv', index=False, header=False)  # no header, matching the header=None reads below
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import math
import os
# Min-max normalization
def normalize(df):
    for col in df.columns:
        df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    return df
def load_data():
    # Load the training features
    feature1Path = os.path.join("data", "numeric_feature.csv")
    feature2Path = os.path.join("data", "normal_feature_gainRate.csv")
    pricePath = os.path.join("data", "SalePrice.csv")
    train_data = pd.concat([pd.read_csv(feature1Path), pd.read_csv(feature2Path)], axis=1)
    df2 = pd.read_csv(pricePath, header=None)
    # Normalize the features
    train_data = normalize(train_data)
    x = train_data  # all concatenated columns are features
    y = df2         # SalePrice is the label
    # Split the data (75% train, 25% test)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=None)
    return train_data, x_train, x_test, y_train, y_test
# Build a random forest model
def model(train_x, train_y):
    my_model = RandomForestRegressor()
    my_model.fit(train_x, train_y.values.flatten())
    return my_model
train_data, train_x, test_x, train_y, test_y = load_data()
# Train the random forest and predict on the held-out test split
my_model = model(train_x, train_y)
predicted_prices = my_model.predict(test_x)
# MAE and RMSE (reported on a log scale) as evaluation metrics
print("\nlog(MAE) between predictions and ground truth:")
print(math.log(mean_absolute_error(test_y, predicted_prices)))
print("log(RMSE) between predictions and ground truth:")
print(math.log(np.sqrt(mean_squared_error(test_y, predicted_prices))))
# Save the predictions alongside the held-out ground truth
my_results = pd.DataFrame({'SalePrice': predicted_prices})
my_results.to_csv('PredictionResults.csv', index=False)
test_y.to_csv('PredictionSample.csv', index=False)
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
df = pd.concat([pd.read_csv("data/numeric_feature.csv"), pd.read_csv("data/normal_feature_corr.csv")], axis=1)
# Min-max normalize every feature column
for col in df.columns:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
df2 = pd.read_csv("data/SalePrice.csv", header=None)
train_X, test_X, train_y, test_y = train_test_split(df, df2, test_size=0.25, random_state=42)
clf = linear_model.BayesianRidge()
clf.fit(train_X, train_y.values.ravel())
y_pred = clf.predict(test_X)
print("\nlog(MAE) between predictions and ground truth:")
print(np.log(mean_absolute_error(test_y, y_pred)))
print("log(RMSE) between predictions and ground truth:")
print(np.log(np.sqrt(mean_squared_error(test_y, y_pred))))
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import math
import numpy as np
feature1Path = os.path.join("data", "numeric_feature.csv")
feature2Path = os.path.join("data", "normal_feature_corr.csv")
pricePath = os.path.join("data", "SalePrice.csv")
df = pd.concat([pd.read_csv(feature1Path), pd.read_csv(feature2Path)], axis=1)
# Min-max normalize every feature column
for col in df.columns:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
df2 = pd.read_csv(pricePath, header=None)
# Split the data (75% train, 25% test)
train_X, test_X, train_y, test_y = train_test_split(df, df2, test_size=0.25, random_state=None)
# Train the XGBoost regression model
xg = XGBRegressor(n_estimators=500, learning_rate=0.1, min_child_weight=5, max_depth=4)
xg.fit(train_X, train_y)
xg.score(test_X, test_y)  # R^2 on the test split
pre_y = xg.predict(test_X)
print("\nlog(MAE) between predictions and ground truth:")
print(math.log(mean_absolute_error(test_y, pre_y)))
print("log(RMSE) between predictions and ground truth:")
print(math.log(np.sqrt(mean_squared_error(test_y, pre_y))))