# NOTE(review): stray line-number residue from file extraction removed.
# encoding=utf8
"""Bike-sharing demand regression with XGBoost.

Notes (translated from the original Chinese):
1. Correlation analysis
2. Outlier detection
3. seaborn usage
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from xgboost import XGBRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Show every row/column when printing frames (exploration convenience).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
def output(model, X_test, test_datetime):
    """Predict on X_test, undo the log1p target transform, and write submission.csv.

    Parameters
    ----------
    model : fitted regressor (trained on log1p-transformed 'count')
    X_test : feature frame for the test rows
    test_datetime : Series of the original test 'datetime' strings,
        aligned with X_test by position

    Side effect: writes 'submission.csv' (columns: datetime, count) to the CWD.
    """
    # Training target is log1p(count) (see the loading code); expm1 inverts it.
    y_pred = np.expm1(model.predict(X_test))
    result = pd.DataFrame({'count': y_pred})
    # reset_index guards against misalignment: concat(axis=1) joins on index,
    # and 'result' always has a fresh 0..n-1 index.
    result = pd.concat([test_datetime.reset_index(drop=True), result], axis=1)
    result.to_csv('submission.csv', index=False)
def xgboost_regress(x_train, y_train, x_valid, y_valid):
    """Fit an XGBoost regressor on (x_train, y_train) and return the model.

    x_valid / y_valid are currently unused; they are kept for the
    commented-out early-stopping variants below.
    """
    # Earlier hyper-parameter experiments kept for reference:
    # model = XGBRegressor(max_depth=4, n_estimators=800, learning_rate=0.1,
    #                      subsample=0.7, reg_alpha=0.1, reg_lambda=0.1,
    #                      colsample_bytree=0.7)
    # model = XGBRegressor(max_depth=6, n_estimators=1000, min_child_weight=300,
    #                      colsample_bytree=0.8, subsample=0.8, eta=0.3, seed=42)
    # Fixed: 'colsample_btree' was a typo for 'colsample_bytree' —
    # the misspelled keyword was silently ignored by older xgboost versions.
    model = XGBRegressor(max_depth=5, colsample_bytree=0.1, learning_rate=0.1,
                         subsample=0.7, n_estimators=400, min_child_weight=2)
    # CV / early-stopping experiments kept for reference:
    # kfold = KFold(n_splits=3, random_state=7)
    # results = -cross_val_score(model, x_train, y_train,
    #                            scoring='neg_mean_squared_error', cv=kfold)
    # print("Accuracy: %.2f (%.2f)" % (results.mean(), results.std()))
    # model.fit(x_train, y_train, eval_metric='rmse',
    #           eval_set=[(x_train, y_train), (x_valid, y_valid)],
    #           verbose=True, early_stopping_rounds=100)
    # Fixed: was model.fit(X_train, y_train) — accidentally read the
    # module-level global X_train instead of the 'x_train' parameter.
    model.fit(x_train, y_train)
    return model
# ---- Load data -------------------------------------------------------------
train_data = pd.read_csv("./data/train.csv")
# 'casual' and 'registered' sum to 'count' and are absent from the test set.
train_data = train_data.drop(["casual", "registered"], axis=1)
# The target is right-skewed: train on log1p(count) (inverted with expm1
# when writing the submission).
train_data["count"] = np.log1p(train_data["count"])

test_data = pd.read_csv("./data/test.csv")
test_data["count"] = 0  # placeholder so train/test share the same columns
test_datetime = test_data["datetime"]  # kept aside for the submission file

print(train_data.info())
print(test_data.info())

# Fixed: DataFrame.append was deprecated and removed in pandas 2.0;
# pd.concat (default ignore_index=False) reproduces the same stacked frame.
combined_data = pd.concat([train_data, test_data])
# 'count' is skewed; the log transform above normalizes it
#sns.distplot(train_data["count"])
#plt.show()

## boxplot can reveal outliers
#sns.boxplot(x=train_data.windspeed)
#plt.show()

# inspect the relationship between two variables
#sns.barplot('weather','count',data=train_data)
#plt.show()

#sns.barplot('temp','count',data=train_data)
#plt.show()
# ---- Feature engineering ---------------------------------------------------
# Parse each timestamp once with a vectorized pd.to_datetime instead of the
# original per-row string splits plus two strptime calls per row.
_parsed = pd.to_datetime(combined_data["datetime"], format="%Y-%m-%d %H:%M:%S")
combined_data["hour"] = _parsed.dt.hour.astype("int")
combined_data["year"] = _parsed.dt.year.astype("int")
combined_data["weekday"] = _parsed.dt.weekday.astype("int")
combined_data["month"] = _parsed.dt.month.astype("int")
# The intermediate 'date' column of the original is no longer needed;
# only 'datetime' itself must be dropped before modeling.
combined_data = combined_data.drop(["datetime"], axis=1)
print(combined_data.info())
# Split the combined frame back into train/test: the train rows were stacked
# first, so the first len(train_data) rows belong to the training set.
# (Generalized from the hard-coded 10886-row constant; at this point
# train_data still holds the original training frame.)
n_train = train_data.shape[0]
train_data = combined_data[:n_train]
test_data = combined_data[n_train:]
print(test_data.head(10))

#print(train_data.corr())
# ---- Train and predict -----------------------------------------------------
X_train = train_data.drop(["count"], axis=1)
Y_train = train_data["count"]

# Hold-out validation variant kept for reference:
# train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
# model = xgboost_regress(train_x, train_y, valid_x, valid_y)

# Fit on the full training set (no validation split).
model = xgboost_regress(X_train, Y_train, None, None)

# Drop the placeholder target column and write submission.csv.
X_test = test_data.drop(['count'], axis=1)
output(model, X_test, test_datetime)