Kaggle: Predicting Bike Rental Demand (Bike Sharing Demand)

The main feature processing in this competition is converting the datetime string column into separate year, month, weekday, and hour columns. An XGBoost regressor trained on these features then reaches an error of roughly 0.40 (the competition metric is RMSLE, which is simply RMSE computed on log1p-transformed counts).
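As a minimal sketch of that feature extraction (assuming only a `datetime` column in the competition's `YYYY-MM-DD HH:MM:SS` format; the two sample rows below are made up), pandas' datetime accessors produce the same columns that the full script derives by splitting the string manually:

import pandas as pd

# Two made-up rows in the competition's timestamp format.
df = pd.DataFrame({"datetime": ["2011-01-20 14:00:00", "2011-01-20 15:00:00"]})

# Parse once, then pull out the calendar features fed to the model.
ts = pd.to_datetime(df["datetime"])
df["year"] = ts.dt.year
df["month"] = ts.dt.month
df["weekday"] = ts.dt.weekday   # Monday=0 ... Sunday=6
df["hour"] = ts.dt.hour
print(df)

The full script: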

#encoding=utf8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

'''
Covered in this script:
1. correlation analysis
2. outlier detection
3. basic seaborn usage
'''

def output(model, X_test, test_datetime):
    # The model was trained on log1p(count), so invert with expm1 before writing the submission.
    y_pred = np.expm1(model.predict(X_test))
    result = pd.DataFrame({'count': y_pred})
    result = pd.concat([test_datetime, result], axis=1)
    result.to_csv('submission.csv', index=False)

def xgboost_regress(x_train, y_train, x_valid, y_valid):
    #model = XGBRegressor(max_depth=4, n_estimators=800, learning_rate=0.1, subsample=0.7, reg_alpha=0.1, reg_lambda=0.1, colsample_bytree=0.7)
    #model = XGBRegressor(max_depth=6, n_estimators=1000, min_child_weight=300, colsample_bytree=0.8, subsample=0.8, eta=0.3, seed=42)
    model = XGBRegressor(max_depth=5, colsample_bytree=0.1, learning_rate=0.1, subsample=0.7, n_estimators=400, min_child_weight=2)

    #kfold = KFold(n_splits=3, shuffle=True, random_state=7)
    #results = -cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=kfold)
    #print("MSE: %.2f (%.2f)" % (results.mean(), results.std()))

    #model.fit(x_train, y_train, eval_metric='rmse', eval_set=[(x_train, y_train), (x_valid, y_valid)], verbose=True, early_stopping_rounds=100)
    #model.fit(x_train, y_train, eval_metric='rmse', eval_set=[(x_train, y_train), (x_valid, y_valid)], verbose=True)
    model.fit(x_train, y_train)
    return model

train_data = pd.read_csv("./data/train.csv")
train_data = train_data.drop(["casual","registered"], axis=1)
train_data["count"] = np.log1p(train_data["count"])

test_data = pd.read_csv("./data/test.csv")
test_data["count"] = 0
test_datetime = test_data["datetime"]

print(train_data.info())
print(test_data.info())

combined_data = pd.concat([train_data, test_data])  # DataFrame.append was removed in newer pandas

# "count" is right-skewed, so log1p brings it closer to a normal distribution
#sns.histplot(train_data["count"], kde=True)
#plt.show()

# a boxplot helps spot outliers
#sns.boxplot(x=train_data.windspeed)
#plt.show()

# look at how individual features relate to count
#sns.barplot(x='weather', y='count', data=train_data)
#plt.show()

#sns.barplot(x='temp', y='count', data=train_data)
#plt.show()

combined_data["date"] = combined_data.datetime.apply(lambda x : x.split()[0])
combined_data["hour"] = combined_data.datetime.apply(lambda x : x.split()[1].split(":")[0]).astype("int")
combined_data["year"] = combined_data.datetime.apply(lambda x : x.split()[0].split("-")[0]).astype("int")
combined_data["weekday"] = combined_data.date.apply(lambda dateString : datetime.strptime(dateString,"%Y-%m-%d").weekday())
combined_data["month"] = combined_data.date.apply(lambda dateString : datetime.strptime(dateString,"%Y-%m-%d").month)
combined_data = combined_data.drop(["date", "datetime"], axis=1)
print(combined_data.info())

train_data = combined_data.iloc[:10886]   # the first 10886 rows are the original training set
test_data = combined_data.iloc[10886:]
print(test_data.head(10))

#print(train_data.corr())

X_train = train_data.drop(["count"], axis=1)
Y_train = train_data["count"]
#train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
#model = xgboost_regress(train_x, train_y, valid_x, valid_y)
model = xgboost_regress(X_train, Y_train, None, None)

X_test = test_data.drop(['count'], axis=1)
output(model, X_test, test_datetime)
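
To get a local estimate of the reported ~0.40 error without submitting, one option is to hold out part of the training data and compute RMSE on the log1p scale, which is what the leaderboard's RMSLE measures. This is only a sketch reusing the names defined above (X_train, Y_train, xgboost_regress); a time-based split would mimic the competition's train/test split more faithfully than a random one:

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Random 80/20 hold-out on the training rows.
train_x, valid_x, train_y, valid_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)
valid_model = xgboost_regress(train_x, train_y, valid_x, valid_y)
valid_pred = valid_model.predict(valid_x)

# Y_train is already log1p(count), so this RMSE is directly comparable to the leaderboard RMSLE.
rmse_log = mean_squared_error(valid_y, valid_pred) ** 0.5
print("validation RMSE on log1p(count): %.4f" % rmse_log)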