import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt # Matlab-style plotting
import matplotlib.gridspec as gridspec
import matplotlib.style as style
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from scipy import stats
from scipy.stats import norm, skew #for some statistics
from subprocess import check_output
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import init
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
path = "./data/house-prices-advanced-regression-techniques"
print(check_output(["ls", path]).decode("utf8")) #check the files available in the directory
train = pd.read_csv(path + '/train.csv')
test = pd.read_csv(path + '/test.csv')
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)
print(train.shape)
print(test.shape)
train.head(5)
test.head()
def plotting_3_chart(df, feature):
    ## Create a customized figure: histogram, QQ-plot, and box plot on one grid.
    fig = plt.figure(constrained_layout=True, figsize=(12, 8))
    ## Create a grid of 3 columns and 3 rows.
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)
    ## Customize the histogram.
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram')
    ## distplot was removed from recent seaborn; histplot with a KDE overlay is the replacement.
    sns.histplot(df.loc[:, feature], kde=True, stat='density', ax=ax1)
    ## Customize the QQ-plot.
    ax2 = fig.add_subplot(grid[1, :2])
    ax2.set_title('QQ_plot')
    stats.probplot(df.loc[:, feature], plot=ax2)
    ## Customize the box plot.
    ax3 = fig.add_subplot(grid[:, 2])
    ax3.set_title('Box Plot')
    sns.boxplot(y=df.loc[:, feature], ax=ax3);
print('Skewness: '+ str(train['SalePrice'].skew()))
print("Kurtosis: " + str(train['SalePrice'].kurt()))
plotting_3_chart(train, 'SalePrice')
train["SalePrice"] = np.log1p(train["SalePrice"])
print('Skewness: '+ str(train['SalePrice'].skew()))
print("Kurtosis: " + str(train['SalePrice'].kurt()))
plotting_3_chart(train, 'SalePrice')
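## A common alternative to log1p is a Box-Cox transform; this is only a
## hedged sketch of that option (lam=0.15 is an illustrative choice, and
## lam=0 reduces boxcox1p to log1p exactly), not part of the pipeline above.
from scipy.special import boxcox1p
bc = boxcox1p(np.expm1(train['SalePrice']), 0.15)
print('Box-Cox(0.15) skewness: ' + str(pd.Series(bc).skew()))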
style.use('ggplot')
sns.set_style('whitegrid')
plt.subplots(figsize = (30,20))
## Plotting heatmap.
# Generate a mask for the upper triangle (taken from seaborn example gallery)
corr = train.corr(numeric_only=True)  # recent pandas requires numeric_only for mixed dtypes
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from NumPy; use the builtin bool
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,
            cmap=sns.diverging_palette(255, 133, l=60, n=7),
            mask=mask,
            annot=True,
            center=0,
            );
## Give title.
plt.title("Heatmap of all the Features", fontsize = 30);
#scatter plot grlivarea/saleprice
var = 'GrLivArea'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice');
#Deleting outliers (SalePrice is already on the log1p scale, hence the 12.5 cutoff)
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<12.5)].index)
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice')
plt.xlabel('GrLivArea')
plt.show()
#box plot overallqual/saleprice
var = 'OverallQual'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
#scatterplot
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train[cols], height = 2.5)
plt.show();
print(train.shape)
dataset = pd.concat(objs=[train, test], axis=0,sort=False).reset_index(drop=True)
dataset.head()
total = dataset.isnull().sum().sort_values(ascending=False)
total.drop("SalePrice",inplace=True)
percent = (dataset.isnull().sum()/dataset.isnull().count()).sort_values(ascending=False)
percent.drop("SalePrice",inplace=True)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)
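## Optional: visualize the missing-value percentages computed above.
f, ax = plt.subplots(figsize=(12, 6))
missing_data['Percent'].head(20).plot.bar(ax=ax)
plt.title('Fraction of missing values per feature (top 20)')
plt.show()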
remove_columns=percent[percent>0.002]
columns=pd.DataFrame(remove_columns)
print("我们会舍弃下列特征,因为它们的缺失率高于 "+str(0.002*100)+"%: ")
print(remove_columns)
dataset=dataset.drop(columns.index,axis=1)
cat = dataset.select_dtypes("object")
for column in cat:
    # fill categorical gaps with the column mode
    dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
fl = dataset.select_dtypes(["float64", "int64"]).drop("SalePrice", axis=1)
for column in fl:
    # fill numeric gaps with the column median
    dataset[column] = dataset[column].fillna(dataset[column].median())
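## A hedged alternative to the manual loops above: sklearn's SimpleImputer
## expresses the same mode/median strategy declaratively. Shown commented out
## as a sketch only, since the data has already been imputed in place.
# from sklearn.impute import SimpleImputer
# dataset[cat.columns] = SimpleImputer(strategy='most_frequent').fit_transform(dataset[cat.columns])
# dataset[fl.columns] = SimpleImputer(strategy='median').fit_transform(dataset[fl.columns])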
print(dataset.shape)
dataset.drop("SalePrice",axis=1).isnull().values.any()
dataset.to_csv('dataset.csv', index=False)
dataset = pd.read_csv('dataset.csv')
dataset.head()
dataset = pd.get_dummies(dataset, dummy_na=True, drop_first=True)
sale_price = dataset['SalePrice']
scaler = MinMaxScaler()
dataset = pd.DataFrame(scaler.fit_transform(dataset), columns = dataset.columns)
dataset['SalePrice'] = sale_price
dataset.head()
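## Caveat: fitting the scaler on train and test together leaks test-set
## statistics into the features. A leak-free variant (a sketch only, not
## applied here) would fit on the training rows alone:
# train_mask = dataset['SalePrice'].notnull()
# features = dataset.drop('SalePrice', axis=1)
# scaler = MinMaxScaler().fit(features[train_mask])
# dataset[features.columns] = scaler.transform(features)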
pytrain=dataset[dataset["SalePrice"].notnull()].copy()
pytest=dataset[dataset["SalePrice"].isna()].copy()
print(pytrain.shape)
print(pytest.shape)
pytest.drop('SalePrice', axis=1, inplace=True)
X_train, X_val, y_train, y_val = train_test_split(pytrain.drop('SalePrice', axis=1), pytrain['SalePrice'], test_size=0.2, random_state=42)
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        # 219 must match the number of feature columns after one-hot encoding
        self.fc1 = nn.Linear(219, 144)
        self.fc2 = nn.Linear(144, 72)
        self.fc3 = nn.Linear(72, 36)
        self.fc4 = nn.Linear(36, 18)
        self.fc5 = nn.Linear(18, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        # the final ReLU keeps predicted prices non-negative
        x = F.relu(self.fc5(x))
        return x
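## The same architecture can be written more compactly with nn.Sequential;
## an equivalent sketch (the MLP class above is what actually gets trained):
mlp_sequential = nn.Sequential(
    nn.Linear(219, 144), nn.ReLU(),
    nn.Linear(144, 72), nn.ReLU(),
    nn.Linear(72, 36), nn.ReLU(),
    nn.Linear(36, 18), nn.ReLU(),
    nn.Linear(18, 1), nn.ReLU(),
)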
def adjust_learning_rate(epoch, learning_rate, lr_decay_epochs, optimizer):
    # decay the base learning rate by 10x for every milestone already passed
    steps = np.sum(epoch > np.asarray(lr_decay_epochs))
    if steps > 0:
        new_lr = learning_rate * (0.1 ** steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr
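## Note: torch.optim.lr_scheduler.MultiStepLR implements the same 10x step
## decay; a sketch of the built-in equivalent (commented out because the
## optimizer only exists inside pred_by_nnet below):
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[180, 200], gamma=0.1)
# scheduler.step()  # call once per epoch instead of adjust_learning_rate(...)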
def pred_by_nnet(X_train, X_val, y_train, y_val, model):
    # undo the log1p transform so the network is trained on raw prices
    y_train = np.expm1(y_train)
    y_val = np.expm1(y_val)
    # split the training set into 50 mini-batches
    train_batch = np.array_split(X_train, 50)
    label_batch = np.array_split(y_train, 50)
    for i in range(len(train_batch)):
        train_batch[i] = torch.from_numpy(train_batch[i].values).float()
    for i in range(len(label_batch)):
        label_batch[i] = torch.from_numpy(label_batch[i].values).float().view(-1, 1)
    X_val = torch.from_numpy(X_val.values).float()
    y_val = torch.from_numpy(y_val.values).float().view(-1, 1)
    # sanity-check the output shape on the first batch
    ps = model(train_batch[0])
    print(ps.shape)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    lr_decay_epochs = [180, 200]
    epochs = 220
    train_losses, test_losses = [], []
    for e in range(epochs):
        model.train()  # re-enable training mode after the previous epoch's eval()
        train_loss = 0
        adjust_learning_rate(e, 0.001, lr_decay_epochs, optimizer)
        for i in range(len(train_batch)):
            optimizer.zero_grad()
            output = model(train_batch[i])
            # RMSE in log space (RMSLE-style); the clamp avoids log(0),
            # which the final ReLU can otherwise produce
            loss = torch.sqrt(criterion(torch.log(output.clamp(min=1e-6)),
                                        torch.log(label_batch[i])))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # validate once per epoch
        test_loss = 0
        with torch.no_grad():
            model.eval()
            predictions = model(X_val)
            test_loss += torch.sqrt(criterion(torch.log(predictions.clamp(min=1e-6)),
                                              torch.log(y_val))).item()
        train_losses.append(train_loss/len(train_batch))
        test_losses.append(test_loss)
        print("Epoch: {}/{}.. ".format(e+1, epochs),
              "Training Loss: {:.3f}.. ".format(train_loss/len(train_batch)),
              "Test Loss: {:.3f}.. ".format(test_loss))
    return train_losses, test_losses
model = MLP()
nntrain_losses, nntest_losses = pred_by_nnet(X_train, X_val, y_train, y_val, model)
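## Plot the per-epoch training and validation losses returned above to
## check convergence of the network.
plt.figure(figsize=(8, 5))
plt.plot(nntrain_losses, label='Training loss')
plt.plot(nntest_losses, label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('RMSE of log(price)')
plt.legend()
plt.show()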
pytest = torch.from_numpy(pytest.values).float()
with torch.no_grad():
    model.eval()
    output = model(pytest)  # call the module directly rather than .forward()
output.shape
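## The network was trained on raw prices (the targets were expm1'd inside
## pred_by_nnet), so its outputs are already in dollars. A sketch of a
## submission built from them, commented out since the final submission
## below uses best_model instead:
# nn_submission = pd.read_csv(path + '/sample_submission.csv')
# nn_submission['SalePrice'] = output.numpy().ravel()
# nn_submission.to_csv('nn_submission.csv', index=False)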
np.random.seed(seed=42)
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge,RidgeCV,BayesianRidge,LinearRegression,Lasso,LassoCV,ElasticNet,RANSACRegressor,HuberRegressor,PassiveAggressiveRegressor,ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
r_s = 42
my_regressors = [
    ElasticNet(alpha=0.001, l1_ratio=0.70, max_iter=100, tol=0.01, random_state=r_s),
    ElasticNetCV(l1_ratio=0.9, max_iter=100, tol=0.01, random_state=r_s),
    Lasso(alpha=0.00047, random_state=r_s),
    LassoCV(),
    AdaBoostRegressor(random_state=r_s),
    GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt',
                              min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=r_s),
    XGBRegressor(random_state=r_s),
    LGBMRegressor(objective='regression', num_leaves=4, learning_rate=0.01, n_estimators=5000, max_bin=200,
                  bagging_fraction=0.75, bagging_freq=5, bagging_seed=7, feature_fraction=0.2,
                  feature_fraction_seed=7, verbose=-1, random_state=r_s),
    RandomForestRegressor(random_state=r_s),
    ExtraTreesRegressor(random_state=r_s),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
    DecisionTreeRegressor(),
    Ridge(alpha=6),
    RidgeCV(),
    BayesianRidge(),
    KernelRidge(),
    KNeighborsRegressor(),
    HuberRegressor(),
    PassiveAggressiveRegressor(random_state=r_s),
]
regressors = list(my_regressors)
scores_val=[]
scores_train=[]
RMSE=[]
for regressor in regressors:
    regressor.fit(X_train, y_train)  # fit once, then score on both splits
    scores_val.append(regressor.score(X_val, y_val))
    scores_train.append(regressor.score(X_train, y_train))
    y_pred = regressor.predict(X_val)
    # RMSE on log(SalePrice); the targets are log1p'd, so expm1 then log recovers the log scale
    RMSE.append(np.sqrt(mean_squared_error(np.log(np.expm1(y_val)), np.log(np.expm1(y_pred)))))
results=zip(scores_val,scores_train,RMSE)
results=list(results)
results_score_val=[item[0] for item in results]
results_score_train=[item[1] for item in results]
results_RMSE=[item[2] for item in results]
df_results=pd.DataFrame({"Algorithms":my_regressors,"Training Score":results_score_train,"Validation Score":results_score_val,"RMSE":results_RMSE})
df_results
best_models = df_results.sort_values(by="RMSE")
best_model = best_models.iloc[0]["Algorithms"]  # lowest-RMSE model; avoids deprecated chained indexing
best_stack = best_models["Algorithms"].values
best_models
print(best_model)
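## A hedged sanity check before the final fit: 5-fold cross-validated RMSE of
## the selected model on the full training set (sklearn reports negated MSE,
## hence the sign flip). This can be slow for the boosted models.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(best_model, pytrain.drop('SalePrice', axis=1),
                            pytrain['SalePrice'], cv=5,
                            scoring='neg_mean_squared_error')
print('CV RMSE: ' + str(np.sqrt(-cv_scores).mean()))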
best_model.fit(pytrain.drop('SalePrice', axis=1), pytrain['SalePrice'])
y_test=best_model.predict(pytest)
submission = pd.read_csv(path + '/sample_submission.csv')
submission['SalePrice'] = np.expm1(y_test)
submission.to_csv('submission.csv', index=False)