一、场景介绍
像“V站”APP,可在即时盘或滚动盘中进行大小球的预测;本文利用机器学习预测大小球的概率。
二、数据说明
数据主要有两方面:即时盘或滚动盘的实时赔率数据、整场比赛的离线特征及衍生特征。
备注:
① 赔率数据的缺失比例与不同博彩公司在不同场景下的开盘情况有关,且赔率数据存储所占内存一般较大。常用的博彩公司有bet365、皇冠、立博、易胜博、韦德、威廉希尔等。
② 整场比赛的训练特征最好做特征衍生,因为在预测时比赛的整场原始特征是没有的。
三、模型说明
对实时赔率特征、实时赔率+离线特征分别进行了lstm、bilstm和xgb的模型效果对比。
四、代码说明
4.1 sql提取数据
(1) 实时赔率特征提取
-- 主盘口-即时盘-【bet365/皇冠/立博/威廉希尔/易胜博/韦德】-亚盘/欧盘/大小球
-- Latest main-line "instant" odds row per (match, bookmaker, market type).
-- Step 1: keep usable rows and split the odds string into its parts once.
with parsed_odds as (
    select
        a.match_id,
        a.company_id,
        a.handicap_type,
        a.odds_date,
        -- strip the surrounding braces, then split on commas
        split(regexp_replace(regexp_replace(a.odds, '\\}', ''), '\\{', ''), ',') as odds_parts
    from football.ft_t_odds a
    where a.handicap_num = 1                        -- main handicap line
      and a.tag = '即'                               -- instant (pre-match) odds
      and a.company_id in (2, 3, 5, 9, 10, 11)      -- 2: BET365 (UK); 3: 皇冠; 5: 立博; 9: 威廉希尔; 10: 易胜博; 11: 韦德
      and a.handicap_type in (1, 2, 3)              -- 1: Asian handicap; 2: European (1X2); 3: over/under
      and a.odds is not null
      and a.odds not like '%封%'                     -- skip rows whose odds text contains the "封" marker
),
-- Step 2: rank rows inside each (match, bookmaker, market) group, newest first.
ranked_odds as (
    select
        p.match_id,
        p.company_id,
        p.handicap_type,
        p.odds_parts[0] as odds_1,
        p.odds_parts[1] as odds_2,
        p.odds_parts[2] as odds_3,
        row_number() over (
            partition by p.match_id, p.company_id, p.handicap_type
            order by p.odds_date desc
        ) as rk
    from parsed_odds p
)
-- Step 3: keep only the newest row of every group.
select
    concat('A', '_', r.match_id) as match_id,
    r.company_id,
    r.handicap_type,
    r.odds_1,
    r.odds_2,
    r.odds_3
from ranked_odds r
where r.rk = 1
;
(2) 离线特征及标签提取
-- 亚盘盘口汇总
-- List every distinct closing Asian-handicap label stored in ft_t_match.
-- The label is the second comma-separated field inside the first {...} group
-- of finally_asia.
select plates.asia_plate
from (
    select
        split_part(split_part(split_part(cast(finally_asia as varchar), '}', 1), '{', 2), ',', 2) as asia_plate
    from public.ft_t_match
) plates
group by plates.asia_plate
;
-- Per-match label extraction for matches played 2017-01-01 .. 2020-09-30.
with match_base as (
    -- Scored matches that have a usable closing Asian handicap and goal line.
    select
        concat('A_', a.id) as match_id,
        a.home_score,
        a.away_score,
        -- second comma-separated field inside the first {...} group
        split_part(split_part(split_part(cast(a.finally_asia as varchar), '}', 1), '{', 2), ',', 2) as asia_plate,            -- closing Asian handicap label
        split_part(split_part(split_part(cast(a.finally_goal as varchar), '}', 1), '{', 2), ',', 2) as finally_goal_handicap  -- closing over/under line
    from ft_t_match a
    where cast(a.match_time as date) between '2017-01-01' and '2020-09-30'
      and a.has_score_line = '1'
      and a.home_score is not null
      and a.away_score is not null
      and a.finally_asia is not null
      and a.finally_goal is not null
      and cast(a.finally_asia as varchar) not like '%封%'
      and cast(a.finally_goal as varchar) not like '%封%'
),
match_lines as (
    select
        b.match_id,
        b.home_score,
        b.away_score,
        -- Only the part before the first '/' is used.
        -- NOTE(review): for a split line such as '2.5/3' this keeps 2.5; confirm that
        -- ignoring the second half of the line is intended.
        cast(split_part(b.finally_goal_handicap, '/', 1) as numeric) as bigsmall_ball_handicap,
        -- Handicap label -> signed goal handicap. Positive: home team concedes;
        -- negative ('受让' prefix): away team concedes. Unknown labels map to NULL.
        case b.asia_plate
            when '平手' then 0
            when '平手/半球' then 0.25
            when '受让平手/半球' then -0.25
            when '半球' then 0.5
            when '受让半球' then -0.5
            when '半球/一球' then 0.75
            when '受让半球/一球' then -0.75
            when '一球' then 1.0
            when '受让一球' then -1.0
            when '一球/球半' then 1.25
            when '受让一球/球半' then -1.25
            when '球半' then 1.5
            when '受让球半' then -1.5
            when '球半/两球' then 1.75
            when '受让球半/两球' then -1.75
            when '两球' then 2.0
            when '受让两球' then -2.0
            when '两球/两球半' then 2.25
            when '受让两球/两球半' then -2.25
            when '两球半' then 2.5
            when '受让两球半' then -2.5
            when '两球半/三球' then 2.75
            when '受让两球半/三球' then -2.75
            when '三球' then 3.0
            when '受让三球' then -3.0
            when '三球/三球半' then 3.25
            when '受让三球/三球半' then -3.25
            when '三球半' then 3.5
            when '受让三球半' then -3.5
            when '三球半/四球' then 3.75
            when '受让三球半/四球' then -3.75
            when '四球' then 4.0
            when '受让四球' then -4.0
            when '四球/四球半' then 4.25
            when '受让四球/四球半' then -4.25
            when '四球半' then 4.5
            when '受让四球半' then -4.5
            when '四球半/五球' then 4.75
            when '受让四球半/五球' then -4.75
            when '五球' then 5.0
            when '受让五球' then -5.0
            when '五球/五球半' then 5.25
            when '受让五球/五球半' then -5.25
            when '五球半' then 5.5
            when '受让五球半' then -5.5
            when '五球半/六球' then 5.75
            when '受让五球半/六球' then -5.75
            when '六球' then 6.0
            when '受让六球' then -6.0
            when '六球/六球半' then 6.25
            when '受让六球/六球半' then -6.25
            when '六球半' then 6.5
            when '受让六球半' then -6.5
            when '六球半/七球' then 6.75
            when '受让六球半/七球' then -6.75
            when '七球' then 7.0
            when '受让七球' then -7.0
            when '七球/七球半' then 7.25
            when '受让七球/七球半' then -7.25
            when '七球半' then 7.5
            when '受让七球半' then -7.5
            when '七球半/八球' then 7.75
            when '受让七球半/八球' then -7.75
            when '八球' then 8.0
            when '受让八球' then -8.0
            when '八球/八球半' then 8.25
            when '受让八球/八球半' then -8.25
            when '八球半' then 8.5
            when '受让八球半' then -8.5
            when '八球半/九球' then 8.75
            when '受让八球半/九球' then -8.75
            when '九球' then 9.0
            when '受让九球' then -9.0
            when '九球/九球半' then 9.25
            when '受让九球/九球半' then -9.25
            when '九球半' then 9.5
            when '受让九球半' then -9.5
            when '九球半/十球' then 9.75
            when '受让九球半/十球' then -9.75
            when '十球' then 10
            when '受让十球' then -10
            -- labels that are stored as plain numbers
            when '10.5' then 10.5
            when '-10.5' then -10.5
            when '10.75' then 10.75
            when '11.5' then 11.5
            when '-11.5' then -11.5
            when '11.75' then 11.75
            when '12.5' then 12.5
            when '-13' then -13.0
            when '14.5' then 14.5
            when '15.5' then 15.5
            when '18.5' then 18.5
            else null
        end as concede_points
    from match_base b
)
select
    c.match_id,
    case when c.home_score = c.away_score then 2
         when c.home_score > c.away_score then 1
         when c.home_score < c.away_score then 0
    end as european_label,               -- 1X2 result: 0 = home loss, 1 = home win, 2 = draw
    case when (c.home_score + c.away_score) = c.bigsmall_ball_handicap then 2
         when (c.home_score + c.away_score) > c.bigsmall_ball_handicap then 1
         when (c.home_score + c.away_score) < c.bigsmall_ball_handicap then 0
    end as bigsmall_ball_label,          -- over/under: 0 = under, 1 = over, 2 = push
    case when mod(c.bigsmall_ball_handicap / 0.5, 2) = 0 then 0
         else 1
    end as bigsmall_ball_handicap_label, -- 1 when the goal line is of the ".5" form, else 0
    case when c.concede_points is null then null  -- unmapped handicap: no label (previously fell through to 2 = push)
         when c.concede_points >= 0 and (c.home_score - c.concede_points) > c.away_score then 1
         when c.concede_points >= 0 and (c.home_score - c.concede_points) < c.away_score then 0
         when c.concede_points < 0 and (c.away_score + c.concede_points) > c.home_score then 1
         when c.concede_points < 0 and (c.away_score + c.concede_points) < c.home_score then 0
         else 2
    end as upperlower_plate_label,       -- handicap result: 0 = lower side, 1 = upper side, 2 = push
    abs(c.concede_points) as concede_points_abs,                                -- handicap size (absolute value)
    case when c.concede_points >= 0 then 1 else 0 end as home_concede_label,    -- 1 when the home team concedes
    case when c.concede_points < 0 then 1 else 0 end as away_concede_label      -- 1 when the away team concedes
from match_lines c
;
-- Offline (full-match) feature extraction for matches played 2017-01-01 .. 2020-09-30.
-- Possession values are stored as text such as '55%' and are converted to a 0-1 fraction;
-- an empty or NULL value yields NULL.
-- Rate columns cast the numerator to numeric so the division is never an integer
-- division, and use nullif() so a zero or NULL denominator yields NULL.
select
    concat('A', '_', a.id) as match_id,
    cast(nullif(split_part(a.away_half_possession, '%', 1), '') as float) / 100 as away_half_possession, -- away first-half possession
    cast(nullif(split_part(a.home_half_possession, '%', 1), '') as float) / 100 as home_half_possession, -- home first-half possession
    cast(nullif(split_part(a.away_possession, '%', 1), '') as float) / 100 as away_possession,           -- away full-match possession
    cast(nullif(split_part(a.home_possession, '%', 1), '') as float) / 100 as home_possession,           -- home full-match possession
    a.home_attack,
    a.away_attack,
    a.home_dangerous_attack,
    a.away_dangerous_attack,
    round(cast(a.home_dangerous_attack as numeric) / nullif(a.home_attack, 0), 2) as home_danger_rate,   -- home dangerous-attack rate
    round(cast(a.away_dangerous_attack as numeric) / nullif(a.away_attack, 0), 2) as away_danger_rate,   -- away dangerous-attack rate
    a.home_red,
    a.away_red,
    a.home_yellow,
    a.away_yellow,
    a.home_shoot,
    a.away_shoot,
    a.away_shoot_on,
    a.home_shoot_on,
    a.home_shoot_out,
    a.away_shoot_out,
    round(cast(a.home_shoot_on as numeric) / nullif(a.home_shoot, 0), 2) as home_shoot_rate,             -- home shots-on-target rate
    round(cast(a.away_shoot_on as numeric) / nullif(a.away_shoot, 0), 2) as away_shoot_rate,             -- away shots-on-target rate
    a.away_corner,
    a.home_corner,
    a.away_half_corner,
    a.home_half_corner,
    a.home_shoot_stop,
    a.away_shoot_stop,
    a.home_pass,
    a.away_pass,
    a.home_pass_success,
    a.away_pass_success,
    a.home_foul,
    a.away_foul,
    a.home_head,
    a.away_head,
    a.home_head_success,
    a.away_head_success,
    a.home_follow,
    a.away_follow,
    a.home_slide,
    a.away_slide,
    a.home_past,
    a.away_past,
    a.home_out,
    a.away_out,
    a.home_on_door,
    a.away_on_door,
    a.home_steals,
    a.away_steals,
    a.home_assists,
    a.away_assists,
    a.home_first,
    a.away_first,
    a.home_intercept,
    a.away_intercept
from ft_t_match a
where cast(a.match_time as date) between '2017-01-01' and '2020-09-30'
  and a.has_score_line = '1'
  and a.home_score is not null
  and a.away_score is not null
  and a.finally_asia is not null
  and a.finally_goal is not null
  and cast(a.finally_asia as varchar) not like '%封%'
  and cast(a.finally_goal as varchar) not like '%封%'
;
4.2 数据整合
将sql提取的数据整合成三个文件:比赛标签(match_label_data.csv)、实时赔率数据(odds_features.csv)、离线特征(offline_features.csv),数据之间根据match_id进行关联。
4.3 模型训练
因为最终模型选择了xgb,这里只展示xgb的模型训练。
#!/usr/bin/env python
# coding: utf-8
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore"), color_codes=True) # 设置绘图风格
# Font setup: sans-serif family backed by SimHei so Chinese labels render,
# and a plain ASCII minus sign on the axes.
mpl.rcParams.update({
    'font.family': ['sans-serif'],
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Figure defaults: size, and DPI for both on-screen and saved figures.
plt.rcParams.update({
    'figure.figsize': (15.0, 10.0),
    'savefig.dpi': 200,
    'figure.dpi': 200,
})
# # Data loading
def _load_and_preview(csv_path, show_dtypes=True):
    """Read one CSV file and print its shape, its first rows and, optionally, its dtypes."""
    frame = pd.read_csv(csv_path)
    print(frame.shape)
    print(frame.head())
    if show_dtypes:
        print(frame.dtypes)
    return frame


# ## Match labels
match_label_data = _load_and_preview("./data/final_dataset/match_label_data.csv")
# ## Offline match features (no label columns)
match_offline_features = _load_and_preview("./data/final_dataset/offline_features.csv")
# ## Final in-play odds per match [皇冠]
hg_instant_feature = _load_and_preview("./data/final_dataset/hg_instant_feature_data.csv", show_dtypes=False)
# ## Real-time odds features
odds_features = _load_and_preview("./data/final_dataset/odds_features.csv")
# bet365赔率特征提取
feature_idx = odds_features.columns.str.contains("^bet365.+", regex=True)
# print(feature_idx)
bet365_features = ["match_id"],),)
match_all_data,)
print(match_all_data.shape)
print(match_all_data.dtypes)
# Label distribution over all matches (bare expression: shown in a notebook, silent in a script)
match_all_data.groupby(["bigsmall_ball_label"])["match_id"].count()
# Drop pushes (label 2) so that only the two classes 0 and 1 remain,
# and remove the label columns that are not the prediction target.
# NOTE(review): the SQL label query in this file names its handicap label
# `upperlower_plate_label`; confirm the CSV really has `upperlower_asia_label`.
not_push = match_all_data["bigsmall_ball_label"] != 2
match_binary_data = match_all_data[not_push].drop(columns=["european_label", "upperlower_asia_label"])
# Label distribution after removing pushes
match_binary_data.groupby(["bigsmall_ball_label"])["match_id"].count()
# # 数据探索性分析
# ## 缺失率统计
# 数值型变量统计
match_binary_numeric_stat = match_binary_data.select_dtypes(include=["float64", "int64"]).describe().T, ascending=False)
print(match_binary_numeric_stat.head())
# match_binary_numeric_stat.to_csv("./feature_statistic/match_binary_numeric_missing_stat.csv",) # 结果保存
# Maximum missing rate a numeric feature may have to be kept.
missing_pct_threshold = 0.3
# Keep the numeric features whose missing rate is below the threshold.
numeric_list = match_binary_numeric_stat[match_binary_numeric_stat["missing_pct"] < missing_pct_threshold].index.tolist()
# The message previously said "因子型" (categorical) although this list holds numeric features.
print("数值型特征保留缺失率低于{0}的特征有{1}个。".format(missing_pct_threshold, len(numeric_list)))
print(numeric_list)
# 赛前预测(利用bet365特征)
bet365_features.remove("match_id")
numeric_list):
"""
df: 数据集
feature_list: 特征列表
target: 统计目标
feature_type: 特征类型。默认为numeric.
若为因子型特征,取object:按照占比降序排列;
若为数字型特征,取numeric(float64 & int64):按照特征值升序排列。
"""
# 创建一个空的数据框
return_stat = pd.DataFrame(columns = ["value", "count", "pct", "feature"])
for col in feature_list:
if col == target:
continue
else:
N = len(df)
col_stat = pd.DataFrame(df.groupby(col)[target].count())
col_stat.reset_index(level=0, inplace=True)
col_stat.rename(columns={col: "value", target: "count"}, inplace=True)
n_value = len(col_stat)
col_stat["pct"] = col_stat.apply(lambda x: x[1] / N, axis=1)
col_stat["feature"] = [col for i in range(n_value)]
# print(col_stat)
if feature_type == "object":
, axis=0, ascending=False)
# col_stat = col_stat.sort(columns=["pct"], axis=0, ascending=False)
else:
col_stat = col_stat.sort_values("value",,))
# numeric_feature_stat.to_csv("./feature_statistic/offline_numeric_feature_pct_stat.csv",, index=False)
# ### 特征分布可视化
# 连续特征:直方图、分组直方图;核密度图、小提琴图(分位数) 、分组箱线图
def continuous_feature_plot(df, hist_feature_list, n_bins=50, fontsize=14, target=None):
"""
Histogram and kernel-density plots for continuous features. The original note says that
grouped histograms and grouped box plots are also shown when target is not None.
NOTE(review): no `if target` guard is visible in the code as it stands, the box-plot
section is commented out, and several lines are damaged (see the notes below);
confirm against the original source before running.
df: data set holding the feature columns (and the target column).
hist_feature_list: list of continuous feature names.
n_bins: number of histogram bins, default 50.
fontsize: font size, default 14.
target: target column; exactly two of its values are plotted
(original note: 0 = good user, 1 = bad user).
"""
for col in hist_feature_list:
# NOTE(review): the value that followed the comma appears to be missing here.
print("连续特征:",)
# Histogram (top-left panel of a 2x2 grid)
plt.subplot(221)
plt.tight_layout()
sbn.distplot(df[col])
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
# NOTE(review): the next line contains stray commas (`,,,`) and is not valid Python;
# the violin-plot code that should precede the "小提琴图" title is not visible either.
plt.title("{col}--直方图".format(col=col),,,)
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
plt.title("{col}--小提琴图".format(col=col), fontdict={'weight':'normal', 'size': fontsize})
print("进行分组可视化......")
# Split the rows by the first two distinct target values.
unique_vals = df[target].unique().tolist()
unique_val0 = df[df[target] == unique_vals[0]]
unique_val1 = df[df[target] == unique_vals[1]]
# unique_val2 = df[df[target] == unique_vals[2]]
# Grouped histogram (bottom-left panel)
plt.subplot(223)
plt.tight_layout()
sbn.distplot(unique_val0[col], bins=n_bins, kde=False, norm_hist=True, color='steelblue', label=str(unique_vals[0]))
sbn.distplot(unique_val1[col], bins=n_bins, kde=False, norm_hist=True, color='purple', label=str(unique_vals[1]))
# sns.distplot(unique_val2[col], bins=n_bins, kde=False, norm_hist=True, color='pink', label=str(unique_vals[2]))
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
plt.legend()
plt.title("{col}--分组直方图".format(col=col), fontdict={'weight':'normal', 'size': fontsize})
# Grouped kernel-density plot (bottom-right panel)
plt.subplot(224)
plt.tight_layout()
sbn.distplot(unique_val0[col], hist=False, kde_kws={"color":"red", "linestyle":"-"}, norm_hist=True, label=str(unique_vals[0]))
sbn.distplot(unique_val1[col], hist=False, kde_kws={"color":"black", "linestyle":"--"}, norm_hist=True, label=str(unique_vals[1]))
# sns.distplot(unique_val2[col], hist=False, kde_kws={"color":"green", "linestyle":"-."}, norm_hist=True, label=str(unique_vals[2]))
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
plt.legend()
plt.title("{col}--分组核密度图".format(col=col), fontdict={'weight':'normal', 'size': fontsize})
# The string below is a bare expression used as a section marker ("grouped box plot");
# the box-plot code that follows it is commented out.
"""
分组箱线图
"""
# plt.subplot(222)
# plt.tight_layout()
# sns.boxplot(x=[unique_val0[col], unique_val1[col]], labels=[unique_vals[0], unique_vals[1]])
# plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
# plt.title("{col}特征的分组箱线图".format(col=col),)
# Histogram (left panel of a 1x2 grid)
# NOTE(review): presumably this section was the `else` branch for target=None — confirm.
plt.subplot(121)
plt.tight_layout()
sbn.distplot(df[col])
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
# NOTE(review): same damage as above — stray commas, and the violin-plot code is missing.
plt.title("{col}--直方图".format(col=col),,,)
plt.xlabel(col, fontdict={'weight':'normal', 'size': fontsize})
plt.title("{col}--小提琴图".format(col=col), fontdict={'weight':'normal', 'size': fontsize})
# Save the current figure to the working directory, one file per feature.
plt.savefig("{col}--直方图&箱线图.png".format(col=col))
为目标变量进行可视化
def continuous_visualize(df, hist_feature_list, n_bins=50, fontsize=14, target=None, exclude_cols=None):
    """
    Plot each continuous feature with continuous_feature_plot, one feature at a time.

    df: data set holding the features and the target column.
    hist_feature_list: feature names to plot.
    n_bins: number of histogram bins, passed through to continuous_feature_plot.
    fontsize: font size, passed through to continuous_feature_plot.
    target: target column; it is never plotted as a feature.
    exclude_cols: feature names to skip. None (the default) means "skip nothing";
        previously the default raised TypeError at `col not in exclude_cols`.
    """
    skipped = () if exclude_cols is None else exclude_cols
    for col in hist_feature_list:
        if col == target or col in skipped:
            continue
        # Rows whose value equals -999 are left out of this feature's plots
        # (presumably a missing-value placeholder — confirm against the data preparation).
        mid_data = df[df[col] != -999]
        continuous_feature_plot(mid_data, [col], n_bins=n_bins, fontsize=fontsize, target=target)
exclude_cols=["away_danger_rate", "home_danger_rate", "away_shoot_rate", "home_shoot_rate", "away_red", "home_red"], exclude_cols=exclude_cols)
# 漏网之鱼
# continuous_visualize(football_copy, ["away_danger_rate"],)
# ## Binary classification
# ### Data extraction
# Keep only the retained numeric columns (the label column is expected to be among them).
bigsmall_ball_binary_xgb = match_binary_data.loc[:, numeric_list]
print(bigsmall_ball_binary_xgb.shape)
bigsmall_ball_binary_xgb.dtypes
# ### Data set split
# Binary classification data for XGBoost model training
# bigsmall_ball_binary_xgb.drop("match_id", axis=1, inplace=True)
y0 = bigsmall_ball_binary_xgb["bigsmall_ball_label"]
X0 = bigsmall_ball_binary_xgb.drop(columns=["bigsmall_ball_label"])
# First hold out 5% as the verify set, then split the remaining 95% into 75% train / 25% test.
X, X_verify, y, y_verify = train_test_split(
    X0, y0,
    train_size=0.95,
    random_state=1234,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.75,
    random_state=1234,
)
for set_caption, feature_frame in (
    ("The size of train set:", X_train),
    ("The size of test set:", X_test),
):
    print(set_caption, feature_frame.shape)
print("The size of verify set:",, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
"""
Define a function for modeling and cross-validation. This function will do the following:
① fit the model
② determine training accuracy
③ determine training AUC
④ determine testing AUC
⑤ update n_estimators with cv function of xgboost package
⑥ plot Feature Importance
⑦ Record the values of each feature under different measures of feature importance
:param alg: model
:param X_train: train set
:param y_train: train label
:param X_test: test set
:param y_test: test label
:param useTrainCV: Whether choose cross validation
:param cv_folds: The number of CV folds
:param early_stopping_rounds: The rounds of early stopping
:return:
Feature importance is stored in the current folder
"""
if useTrainCV:
xgb_param = alg.get_xgb_params()
dtrain = xgb.DMatrix(X_train, label=y_train)
cvResult = xgb.cv(xgb_param,
dtrain,
num_boost_round=alg.get_params()['n_estimators'],
nfold=cv_folds,
# metrics='merror', # mlogloss
metrics=evaluate_metric,
early_stopping_rounds=early_stopping_rounds)
print(cvResult.tail(10))
alg.set_params(n_estimators=cvResult.shape[0])
# fit model
# alg.fit(X_train, y_train, eval_metric='merror') # mlogloss
alg.fit(X_train, y_train, eval_metric=evaluate_metric)
# Predict training set
dtrainPred = alg.predict(X_train)
dtrainPredprob = alg.predict_proba(X_train)[:, 1]
# Print model report
print("\nModel Report")
print("Accuracy : {}".format(metrics.accuracy_score(y_train, dtrainPred)))
print("AUC Score (Train): {}".format(metrics.roc_auc_score(y_train, dtrainPredprob)))
# Predict on testing data:
testPredprob = alg.predict_proba(X_test)[:,1]
print("AUC Score (Test): {}".format(metrics.roc_auc_score(y_test, testPredprob)))
featImportStat = pd.DataFrame(X_train.columns, columns=["feature"])
# 特征重要性
for importType in ['weight', 'total_gain', 'total_cover', 'gain', 'cover']:
print("importType: {0}".format(importType))
plot_importance(alg, importance_type=importType)
plt.savefig(importType + ".png")
, columns=[importType])
importResult.reset_index(col_level=0, inplace=True)
importResult.rename(columns = {"index": "feature"}, inplace=True)
featImportStat = featImportStat.merge(importResult, on=["feature"],)
featImportStat.to_csv("./feature_importance/feature_importance_binary_xgb.csv",,,
),
nthread=4, # 运行的线程数
learning_rate=0.1, # 学习率,默认0.3
gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间
max_depth=6, # 每棵树的最大深度,默认为6
min_child_weight=1, # 最小叶节点的样本权重和
max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0
n_estimators=200, # 迭代轮次
subsample=0.8, # 控制对每棵树随机采样的比例
colsample_bytree=0.8, # 列采样
# reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。
silent=0, # 输出运行信息
objective='binary:logistic', # 二分类
# num_class=3, # 类别数
seed=1234),
param_grid=param_test1,
scoring='roc_auc',
n_jobs=4,
iid=False,
cv=5)
gridsearch1.fit(X_train, y_train)
print(gridsearch1.cv_results_["mean_test_score"]),
nthread=4, # 运行的线程数
learning_rate=0.1, # 学习率,默认0.3
gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间
max_depth=4, # 每棵树的最大深度,默认为6
min_child_weight=7, # 最小叶节点的样本权重和
max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0
n_estimators=200, # 迭代轮次
subsample=0.8, # 控制对每棵树随机采样的比例
colsample_bytree=0.8, # 列采样
# reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。
silent=0, # 输出运行信息
objective='binary:logistic', # 二分类
# num_class=3, # 类别数
seed=1234),
param_grid=param_test2,
scoring='roc_auc',
n_jobs=4,
iid=False,
cv=5)
gridsearch2.fit(X_train, y_train)
print(gridsearch2.cv_results_["mean_test_score"]),
nthread=4, # 运行的线程数
learning_rate=0.1, # 学习率,默认0.3
gamma=0, # 节点分裂所需的最小损失函数下降值,一般在0.01-0.2之间
max_depth=4, # 每棵树的最大深度,默认为6
min_child_weight=7, # 最小叶节点的样本权重和
max_delta_step=0, # 限制每棵树权重改变的最大步长,默认为0
n_estimators=200, # 迭代轮次
subsample=0.8, # 控制对每棵树随机采样的比例
colsample_bytree=0.8, # 列采样
# reg_lambda=1, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。
silent=0, # 输出运行信息
objective='binary:logistic', # 二分类
# num_class=3, # 类别数
seed=1234),
param_grid=param_test3,
scoring='roc_auc',
n_jobs=4,
iid=False,
cv=5)
gridsearch3.fit(X_train, y_train)
print(gridsearch3.cv_results_["mean_test_score"]) # 不同参数下交叉验证结果
print(gridsearch3.best_params_)
print(gridsearch3.best_score_)
# #### Tune subsample and colsample_bytree
# Both parameters are searched over 0.6, 0.7, 0.8 and 0.9.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# 5-fold cross-validated grid search scored by ROC AUC.
# `iid=False` was dropped: scikit-learn removed the `iid` argument in 0.24, so passing
# it raises TypeError there; from 0.22 on the default already matches iid=False.
gridsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=6,
        min_child_weight=1,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gridsearch4.fit(X_train, y_train)
print(gridsearch4.cv_results_["mean_test_score"])  # mean CV score for every parameter combination
print(gridsearch4.best_params_)
print(gridsearch4.best_score_)
# Second subsample / colsample_bytree search; it differs from the previous one
# in the base estimator (n_estimators=205, gamma=0).
param_test41 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# 5-fold cross-validated grid search scored by ROC AUC.
# `iid=False` was dropped: scikit-learn removed the `iid` argument in 0.24, so passing
# it raises TypeError there; from 0.22 on the default already matches iid=False.
gridsearch41 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=205,
        max_depth=6,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test41,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gridsearch41.fit(X_train, y_train)
print(gridsearch41.cv_results_["mean_test_score"])  # mean CV score for every parameter combination
print(gridsearch41.best_params_)
print(gridsearch41.best_score_)
# #### Tune the regularisation parameter
# Candidate values for reg_alpha.
param_test5 = {
    'reg_alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.05, 0.1, 0.5, 1]
}
# 5-fold cross-validated grid search scored by ROC AUC.
# `iid=False` was dropped: scikit-learn removed the `iid` argument in 0.24, so passing
# it raises TypeError there; from 0.22 on the default already matches iid=False.
gridsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=6,
        min_child_weight=1,
        gamma=0.2,
        reg_alpha=0,
        subsample=0.7,
        colsample_bytree=0.9,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gridsearch5.fit(X_train, y_train)
print(gridsearch5.cv_results_["mean_test_score"])  # mean CV score for every parameter combination
print(gridsearch5.best_params_)
print(gridsearch5.best_score_)
# #### 降低学习率
param_test6 = {
'learning_rate': [x/100 for x in range(1, 10)]
}
gridsearch6 = GridSearchCV(estimator = XGBClassifier(
learning_rate=0.1,
n_estimators=200,
max_depth=6,
min_child_weight=1,
gamma=0.2,
subsample=0.7,
a腾讯棋牌类游戏都有这种机制:你充钱它就会记录下来,在冲后的一段时间内它会先让你赢,过一段时间后你会发现赢了多少就会输多少。我玩这个游戏先冲了一点钱,慢慢的赢了两千万吧,然后去富商场玩(还不是尊爵场)真钱麻将游戏,结果对家胡了一个三根杠开花的海底捞月的清金钩钓,瞬间输光。当你输光后就会心里痒痒,想要再次冲钱,结果发现只是又是按照流程走了一遍已赞过已踩过收起热心网友2018-10-07 |