Hotel Cancellation EDA & Predictions

Predicting Hotel Cancellations SK-Learn, Neural Networks & XGBOOST

July 02, 2023 · 83 mins read
# Core analysis stack: pandas/numpy for data wrangling, seaborn/matplotlib for plots.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the hotel-booking training set from the working directory.
data = pd.read_csv('TrainData.csv')
# booking_status is the target (y). NOTE(review): presumably 1 = canceled,
# 0 = not canceled — confirm against the data dictionary.
data.head()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time arrival_year arrival_month arrival_date market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status
0 2 0 1 4 0 0 0 118 2017 12 28 1 0 0 0 110.80 2 0
1 2 1 0 2 0 0 0 17 2018 4 14 1 0 0 0 145.00 0 1
2 1 0 1 5 0 0 0 349 2018 10 4 0 0 0 0 96.67 0 1
3 1 0 2 4 0 0 0 69 2018 6 12 0 0 0 0 120.00 0 1
4 2 0 0 4 1 0 0 11 2018 1 20 1 0 0 0 69.50 1 0
# Summary statistics; every column is already numerically encoded.
data.describe()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time arrival_year arrival_month arrival_date market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status
count 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000
mean 1.846777 0.107515 0.811104 2.208965 0.318465 0.031648 0.336770 85.377405 2017.820698 7.432762 15.660804 0.806197 0.025087 0.022440 0.151403 103.478868 0.617522 0.327618
std 0.516020 0.408901 0.873470 1.426365 0.629140 0.175066 0.772865 86.611736 0.383616 3.076999 8.772788 0.645972 0.156393 0.370078 1.714135 35.474103 0.787941 0.469357
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2017.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 17.000000 2018.000000 5.000000 8.000000 0.000000 0.000000 0.000000 0.000000 80.300000 0.000000 0.000000
50% 2.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 57.000000 2018.000000 8.000000 16.000000 1.000000 0.000000 0.000000 0.000000 99.450000 0.000000 0.000000
75% 2.000000 0.000000 2.000000 3.000000 0.000000 0.000000 0.000000 127.000000 2018.000000 10.000000 23.000000 1.000000 0.000000 0.000000 0.000000 120.270000 1.000000 1.000000
max 4.000000 9.000000 7.000000 17.000000 3.000000 1.000000 6.000000 443.000000 2018.000000 12.000000 31.000000 4.000000 1.000000 13.000000 58.000000 540.000000 5.000000 1.000000
# NOTE(review): seaborn is already imported at the top — this re-import is redundant.
import seaborn as sns
# Correlation heatmap over all numeric columns, rounded to 2 decimals.
plt.figure(figsize = (14,7))
sns.heatmap(data.corr().round(2), annot =True, cmap='YlGnBu')

# Observations: negative correlation between (arrival year & month) and between
# (market segment & lead time); positive correlation between (repeat guest &
# no. of previous bookings not canceled), (market segment & repeat guest),
# and (no. of children & room type).
<AxesSubplot: >

png

# Inspect dtypes and non-null counts; the arrival date is split across three
# integer columns (year/month/date) and is assembled into a datetime later.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18137 entries, 0 to 18136
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          18137 non-null  int64  
 1   no_of_children                        18137 non-null  int64  
 2   no_of_weekend_nights                  18137 non-null  int64  
 3   no_of_week_nights                     18137 non-null  int64  
 4   type_of_meal_plan                     18137 non-null  int64  
 5   required_car_parking_space            18137 non-null  int64  
 6   room_type_reserved                    18137 non-null  int64  
 7   lead_time                             18137 non-null  int64  
 8   arrival_year                          18137 non-null  int64  
 9   arrival_month                         18137 non-null  int64  
 10  arrival_date                          18137 non-null  int64  
 11  market_segment_type                   18137 non-null  int64  
 12  repeated_guest                        18137 non-null  int64  
 13  no_of_previous_cancellations          18137 non-null  int64  
 14  no_of_previous_bookings_not_canceled  18137 non-null  int64  
 15  avg_price_per_room                    18137 non-null  float64
 16  no_of_special_requests                18137 non-null  int64  
 17  booking_status                        18137 non-null  int64  
dtypes: float64(1), int64(17)
memory usage: 2.5 MB
# Fraction of missing values per column (all zeros -> no nulls to impute).
data.isnull().sum()/len(data)
no_of_adults                            0.0
no_of_children                          0.0
no_of_weekend_nights                    0.0
no_of_week_nights                       0.0
type_of_meal_plan                       0.0
required_car_parking_space              0.0
room_type_reserved                      0.0
lead_time                               0.0
arrival_year                            0.0
arrival_month                           0.0
arrival_date                            0.0
market_segment_type                     0.0
repeated_guest                          0.0
no_of_previous_cancellations            0.0
no_of_previous_bookings_not_canceled    0.0
avg_price_per_room                      0.0
no_of_special_requests                  0.0
booking_status                          0.0
dtype: float64
# Check for outliers — only meaningful on continuous variables.
plt.title("Lead_Time", fontdict = {'fontsize': 20})
sns.boxplot(x=data["lead_time"], palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Lead_Time'}, xlabel='lead_time'>

png

# Check for outliers — only meaningful on continuous variables.
plt.title("Avg Price Per Room", fontdict = {'fontsize': 20})
sns.boxplot(x=data["avg_price_per_room"],palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Avg Price Per Room'}, xlabel='avg_price_per_room'>

png

# Joint density (KDE) of lead time vs. average room price.
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data,palette = 'YlGnBu', kind='kde', fill=True)
<seaborn.axisgrid.JointGrid at 0x24a23e46df0>

png

# Same relationship as above, binned into hexagons instead of a KDE.
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data, cmap = 'YlGnBu', kind='hex')
<seaborn.axisgrid.JointGrid at 0x24a23d0ffa0>

png

# Pairplot experiments left disabled (too slow / exploratory only).
# sns.pairplot(data)
# penguins = sns.load_dataset("penguins")
# #sns.pairplot(penguins)
# penguins.info()
# KDE of selected features split by booking_status to eyeball class separability.
plt.figure(figsize = (20, 25))
plt.subplot(5,2,3)
sns.kdeplot(x='lead_time', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,4)
sns.kdeplot(x='arrival_month', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,1)
sns.kdeplot(x='arrival_date', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,2)
sns.kdeplot(x = 'booking_status', hue = 'repeated_guest', palette = 'Set2', fill=True, data = data)
<AxesSubplot: xlabel='booking_status', ylabel='Density'>

png

# Rename the arrival columns so pd.to_datetime can assemble them into one date column.
data = data.rename(columns={'arrival_year': 'year', 'arrival_month': 'month', 'arrival_date': 'day'})
# errors='coerce' turns impossible combinations (e.g. Feb 30) into NaT instead of raising.
data['date'] = pd.to_datetime(data[['year', 'month', 'day']],format='%Y-%m-%d', errors='coerce')
data
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year month day market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status date
0 2 0 1 4 0 0 0 118 2017 12 28 1 0 0 0 110.80 2 0 2017-12-28
1 2 1 0 2 0 0 0 17 2018 4 14 1 0 0 0 145.00 0 1 2018-04-14
2 1 0 1 5 0 0 0 349 2018 10 4 0 0 0 0 96.67 0 1 2018-10-04
3 1 0 2 4 0 0 0 69 2018 6 12 0 0 0 0 120.00 0 1 2018-06-12
4 2 0 0 4 1 0 0 11 2018 1 20 1 0 0 0 69.50 1 0 2018-01-20
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18132 1 0 0 2 0 0 0 103 2018 4 19 0 0 0 0 115.00 0 1 2018-04-19
18133 2 0 0 3 0 0 0 129 2018 8 10 1 0 0 0 88.01 1 0 2018-08-10
18134 2 0 0 1 0 0 0 90 2018 7 13 1 0 0 0 105.30 0 1 2018-07-13
18135 2 0 0 3 0 0 0 18 2018 11 10 1 1 0 1 123.33 1 0 2018-11-10
18136 1 0 1 1 0 0 0 159 2018 4 9 0 0 0 0 65.00 0 0 2018-04-09

18137 rows × 19 columns

# Each row is one reservation; add a constant 1 so a groupby-sum yields daily counts.
data['reservations'] = 1
# Aggregate per arrival date: sums every numeric column, so 'reservations' becomes
# bookings per day and 'booking_status' becomes cancellations per day.
data1 = data.groupby('date').sum()
data1
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year month day market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status reservations
date
2017-07-01 61 0 4 76 60 0 0 8001 68578 238 34 10 0 0 0 3134.00 3 25 34
2017-07-02 2 0 0 3 0 0 0 79 2017 7 2 1 0 0 0 76.50 1 1 1
2017-07-03 2 0 1 2 0 0 0 80 2017 7 3 1 0 0 0 76.50 1 1 1
2017-07-04 1 0 1 1 0 0 0 106 2017 7 4 1 0 0 0 68.00 0 1 1
2017-07-05 2 0 2 1 1 0 0 71 2017 7 5 1 0 0 0 55.80 0 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2018-12-27 164 8 38 268 9 0 31 7408 157404 936 2106 33 1 1 8 6705.57 52 10 78
2018-12-28 73 8 19 140 11 1 22 4043 74666 444 1036 39 1 6 58 4205.69 35 10 37
2018-12-29 108 13 66 171 13 0 29 9223 119062 708 1711 50 0 0 0 6100.70 48 14 59
2018-12-30 98 10 44 100 6 1 23 5506 94846 564 1410 35 0 0 0 6257.65 39 10 47
2018-12-31 51 4 39 42 8 1 7 3194 48432 288 744 16 0 0 0 2947.58 10 7 24

548 rows × 19 columns

from pandas.api.types import CategoricalDtype

# Ordered weekday dtype so plots sort Monday..Sunday instead of alphabetically.
cat_type = CategoricalDtype(categories=['Monday', 'Tuesday',
                                       'Wednesday',
                                       'Thursday', 'Friday',
                                       'Saturday', 'Sunday'],
                           ordered=True)
data.index
RangeIndex(start=0, stop=18137, step=1)

def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by date (one row per day). Not modified in place.
    label : str, optional
        Name of the target column; when given, that series is returned too.

    Returns
    -------
    X : pd.DataFrame of calendar features, or (X, y) when `label` is given.
    """
    # Bug fix: the original ignored the `df` argument and always copied the
    # global `data1`, making the function impossible to reuse on other frames.
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so downstream plots sort Monday..Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Map month*100 + day onto an offset so seasons can be cut at fixed
    # boundaries (320 ~ March 20, roughly the spring equinox).
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300

    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                         labels=['Spring', 'Summer', 'Fall', 'Winter'])

    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X

# Calendar features with the daily reservation count as the target.
X, y = create_features(data1, label='reservations')

# Recombine for plotting convenience.
features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season reservations
date
2017-07-01 5 3 7 2017 182 1 26 Saturday Summer 34
2017-07-02 6 3 7 2017 183 2 26 Sunday Summer 1
2017-07-03 0 3 7 2017 184 3 27 Monday Summer 1
2017-07-04 1 3 7 2017 185 4 27 Tuesday Summer 1
2017-07-05 2 3 7 2017 186 5 27 Wednesday Summer 1
2017-07-06 3 3 7 2017 187 6 27 Thursday Summer 6
2017-07-07 4 3 7 2017 188 7 27 Friday Summer 8
2017-07-08 5 3 7 2017 189 8 27 Saturday Summer 3
2017-07-10 0 3 7 2017 191 10 28 Monday Summer 3
2017-07-11 1 3 7 2017 192 11 28 Tuesday Summer 14
2017-07-12 2 3 7 2017 193 12 28 Wednesday Summer 2
2017-07-13 3 3 7 2017 194 13 28 Thursday Summer 6
2017-07-14 4 3 7 2017 195 14 28 Friday Summer 1
2017-07-15 5 3 7 2017 196 15 28 Saturday Summer 4
2017-07-16 6 3 7 2017 197 16 28 Sunday Summer 3
2017-07-17 0 3 7 2017 198 17 29 Monday Summer 29
2017-07-18 1 3 7 2017 199 18 29 Tuesday Summer 11
2017-07-19 2 3 7 2017 200 19 29 Wednesday Summer 2
2017-07-20 3 3 7 2017 201 20 29 Thursday Summer 2
2017-07-21 4 3 7 2017 202 21 29 Friday Summer 1
# Count plots of key categorical variables, restricted to 2018 bookings.
plt.figure(figsize = (20,25))


plt.subplot(3,2,1)
plt.gca().set_title('Bookings By Month')
sns.countplot(x = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])


plt.subplot(3,2,2)
plt.gca().set_title('Variable market_segment_type')
sns.countplot(x = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
# TODO: relabel market segments with their names (Offline, Online, Corporate, Aviation, Complementary).


plt.subplot(3,2,3)
plt.gca().set_title('Variable booking_status')
sns.countplot(x = 'booking_status', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(3,2,4)
plt.gca().set_title('Variable no_of_children')
sns.countplot(x = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: title={'center': 'Variable no_of_children'}, xlabel='no_of_children', ylabel='count'>

png

# Booking status split by arrival month, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Booking Status By Month",fontweight="bold", fontsize=20)

plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>

png

# TODO: relabel booking_status from 0/1 to descriptive names (not canceled / canceled).

# Booking status broken down by each categorical feature, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Analysis Of Variable booking_status",fontweight="bold", fontsize=20)

plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'no_of_adults', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,2)
sns.countplot(x = 'booking_status', hue = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,3)
sns.countplot(x = 'booking_status', hue = 'no_of_weekend_nights',palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,4)
sns.countplot(x = 'booking_status', hue = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,5)
sns.countplot(x = 'booking_status', hue = 'type_of_meal_plan', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,6)
sns.countplot(x = 'booking_status', hue = 'required_car_parking_space', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,7)
sns.countplot(x = 'booking_status', hue = 'room_type_reserved', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,8)
sns.countplot(x = 'booking_status', hue = 'year', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>

png

# Lead time vs. price, colored by cancellation outcome.
sns.scatterplot(data=data, x="lead_time", y="avg_price_per_room", palette = 'YlGnBu', hue = 'booking_status')
<AxesSubplot: xlabel='lead_time', ylabel='avg_price_per_room'>

png

# Daily reservation counts by weekday, split by season.
# dropna() removes days whose season bin is NaN (date_offset == 0 falls outside the cut bins).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
            x='weekday',
            y='reservations',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Reservations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('reservations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png


def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    NOTE(review): this redefines the earlier `create_features`; the two copies
    are identical and should eventually be deduplicated.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by date (one row per day). Not modified in place.
    label : str, optional
        Name of the target column; when given, that series is returned too.

    Returns
    -------
    X : pd.DataFrame of calendar features, or (X, y) when `label` is given.
    """
    # Bug fix: the original ignored the `df` argument and always copied the
    # global `data1`.
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so downstream plots sort Monday..Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Season boundaries cut on month*100 + day, offset so ~March 20 starts Spring.
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300

    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                         labels=['Spring', 'Summer', 'Fall', 'Winter'])

    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X

# Same calendar features, but the target is now the *daily cancellation count*
# (data1 is the groupby-sum frame, so booking_status was summed per day).
X, y = create_features(data1, label='booking_status')

features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season booking_status
date
2017-07-01 5 3 7 2017 182 1 26 Saturday Summer 25
2017-07-02 6 3 7 2017 183 2 26 Sunday Summer 1
2017-07-03 0 3 7 2017 184 3 27 Monday Summer 1
2017-07-04 1 3 7 2017 185 4 27 Tuesday Summer 1
2017-07-05 2 3 7 2017 186 5 27 Wednesday Summer 1
2017-07-06 3 3 7 2017 187 6 27 Thursday Summer 6
2017-07-07 4 3 7 2017 188 7 27 Friday Summer 6
2017-07-08 5 3 7 2017 189 8 27 Saturday Summer 1
2017-07-10 0 3 7 2017 191 10 28 Monday Summer 2
2017-07-11 1 3 7 2017 192 11 28 Tuesday Summer 4
2017-07-12 2 3 7 2017 193 12 28 Wednesday Summer 2
2017-07-13 3 3 7 2017 194 13 28 Thursday Summer 5
2017-07-14 4 3 7 2017 195 14 28 Friday Summer 1
2017-07-15 5 3 7 2017 196 15 28 Saturday Summer 4
2017-07-16 6 3 7 2017 197 16 28 Sunday Summer 3
2017-07-17 0 3 7 2017 198 17 29 Monday Summer 15
2017-07-18 1 3 7 2017 199 18 29 Tuesday Summer 10
2017-07-19 2 3 7 2017 200 19 29 Wednesday Summer 2
2017-07-20 3 3 7 2017 201 20 29 Thursday Summer 1
2017-07-21 4 3 7 2017 202 21 29 Friday Summer 1
# Daily cancellation counts by weekday, split by season (both years).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
            x='weekday',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Cancelations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

# Restrict to 2018 — the only full calendar year in the data (2017 starts in July).
data_2018 = features_and_target.loc[features_and_target['year'] == 2018]
data_2018.head()
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season booking_status
date
2018-01-01 0 1 1 2018 1 1 1 Monday Winter 0
2018-01-02 1 1 1 2018 2 2 1 Tuesday Winter 0
2018-01-03 2 1 1 2018 3 3 1 Wednesday Winter 0
2018-01-04 3 1 1 2018 4 4 1 Thursday Winter 0
2018-01-05 4 1 1 2018 5 5 1 Friday Winter 0
# Same weekday/season breakdown as above, 2018 only.
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = data_2018.dropna(),
            x='weekday',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Cancelations by Day of Week 2018 Only')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

# Weekly view: daily cancellation counts per ISO week of 2018, split by season.
fig, ax = plt.subplots(figsize=(15,5))
sns.boxplot(data = data_2018.dropna(),
            x='weekofyear',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

# Bug fix: the title and x-label were copy-pasted from the day-of-week plot,
# but this plot's x-axis is week of year.
ax.set_title('Number of Cancelations by Week of Year 2018 Only')
ax.set_xlabel('Week of Year')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

date_data = features_and_target.copy()
# Re-bind data1: it was the daily aggregate, and is now the per-booking frame
# joined with the daily calendar features. Overlapping column names get pandas'
# default _x (booking-level) / _y (daily-aggregate) suffixes.
data1 = data.merge(date_data, on = 'date')
data1.head()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year_x month_x ... dayofweek quarter month_y year_y dayofyear dayofmonth weekofyear weekday season booking_status_y
0 2 0 1 4 0 0 0 118 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
1 2 0 1 1 0 0 0 21 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
2 2 0 1 1 0 1 2 42 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
3 2 0 1 3 1 0 0 61 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
4 2 2 1 0 1 0 3 35 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2

5 rows × 30 columns

# Transposed summary of the merged frame (easier to read with 30 columns).
data1.describe().T
count mean std min 25% 50% 75% max
no_of_adults 18116.0 1.847262 0.51574 0.0 2.0 2.0 2.0 4.0
no_of_children 18116.0 0.107474 0.408828 0.0 0.0 0.0 0.0 9.0
no_of_weekend_nights 18116.0 0.810775 0.873802 0.0 0.0 1.0 2.0 7.0
no_of_week_nights 18116.0 2.208931 1.426151 0.0 1.0 2.0 3.0 17.0
type_of_meal_plan 18116.0 0.318613 0.629172 0.0 0.0 0.0 0.0 3.0
required_car_parking_space 18116.0 0.031629 0.175016 0.0 0.0 0.0 0.0 1.0
room_type_reserved 18116.0 0.336553 0.772457 0.0 0.0 0.0 0.0 6.0
lead_time 18116.0 85.433429 86.63484 0.0 17.0 57.0 127.0 443.0
year_x 18116.0 2017.82049 0.383789 2017.0 2018.0 2018.0 2018.0 2018.0
month_x 18116.0 7.439059 3.073214 1.0 5.0 8.0 10.0 12.0
day 18116.0 15.645341 8.7661 1.0 8.0 16.0 23.0 31.0
market_segment_type 18116.0 0.805917 0.645484 0.0 0.0 1.0 1.0 4.0
repeated_guest 18116.0 0.024895 0.15581 0.0 0.0 0.0 0.0 1.0
no_of_previous_cancellations 18116.0 0.022411 0.370221 0.0 0.0 0.0 0.0 13.0
no_of_previous_bookings_not_canceled 18116.0 0.150364 1.711651 0.0 0.0 0.0 0.0 58.0
avg_price_per_room 18116.0 103.498054 35.461471 0.0 80.3 99.45 120.285 540.0
no_of_special_requests 18116.0 0.617852 0.788105 0.0 0.0 0.0 1.0 5.0
booking_status_x 18116.0 0.327832 0.469436 0.0 0.0 0.0 1.0 1.0
reservations 18116.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0
dayofweek 18116.0 3.090197 2.062916 0.0 1.0 3.0 5.0 6.0
quarter 18116.0 2.801391 1.033189 1.0 2.0 3.0 4.0 4.0
month_y 18116.0 7.439059 3.073214 1.0 5.0 8.0 10.0 12.0
year_y 18116.0 2017.82049 0.383789 2017.0 2018.0 2018.0 2018.0 2018.0
dayofyear 18116.0 210.667311 93.777748 1.0 135.0 226.0 286.0 365.0
dayofmonth 18116.0 15.645341 8.7661 1.0 8.0 16.0 23.0 31.0
weekofyear 18116.0 30.416483 13.389291 1.0 20.0 33.0 41.0 52.0
booking_status_y 18116.0 17.444138 15.874724 0.0 6.0 14.0 23.0 85.0
# booking_status_x is the per-booking (0/1) label carried through the merge.
data1['booking_status_x']
0        0
1        0
2        0
3        0
4        0
        ..
18111    1
18112    0
18113    1
18114    0
18115    1
Name: booking_status_x, Length: 18116, dtype: int64
# Inspect correlation between the label and each numeric feature.
# numeric_only=True: the frame now contains datetime/categorical columns, and
# relying on the implicit default is deprecated (it raised the FutureWarning
# shown in the original run).
print(data1.corr(numeric_only=True)["booking_status_x"].sort_values(ascending=False))
booking_status_x                        1.000000
lead_time                               0.434283
booking_status_y                        0.355568
year_y                                  0.183568
year_x                                  0.183568
avg_price_per_room                      0.145339
no_of_week_nights                       0.096321
no_of_adults                            0.093965
type_of_meal_plan                       0.076771
no_of_weekend_nights                    0.061341
no_of_children                          0.035009
dayofweek                               0.028634
room_type_reserved                      0.021954
day                                     0.012104
dayofmonth                              0.012104
quarter                                 0.000175
weekofyear                             -0.011193
dayofyear                              -0.011705
month_y                                -0.012305
month_x                                -0.012305
no_of_previous_cancellations           -0.032113
market_segment_type                    -0.045607
no_of_previous_bookings_not_canceled   -0.060390
required_car_parking_space             -0.092620
repeated_guest                         -0.107060
no_of_special_requests                 -0.248649
reservations                                 NaN
Name: booking_status_x, dtype: float64


C:\Users\Brett\AppData\Local\Temp\ipykernel_21676\86188331.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  print(data1.corr()["booking_status_x"].sort_values(ascending=False))
# Drop the constant 'reservations' column and 'booking_status_y' (the daily
# cancellation total), which would leak target information into the features.
data1 = data1.drop(['reservations', 'booking_status_y'], axis = 1)
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18116 entries, 0 to 18115
Data columns (total 28 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   no_of_adults                          18116 non-null  int64         
 1   no_of_children                        18116 non-null  int64         
 2   no_of_weekend_nights                  18116 non-null  int64         
 3   no_of_week_nights                     18116 non-null  int64         
 4   type_of_meal_plan                     18116 non-null  int64         
 5   required_car_parking_space            18116 non-null  int64         
 6   room_type_reserved                    18116 non-null  int64         
 7   lead_time                             18116 non-null  int64         
 8   year_x                                18116 non-null  int64         
 9   month_x                               18116 non-null  int64         
 10  day                                   18116 non-null  int64         
 11  market_segment_type                   18116 non-null  int64         
 12  repeated_guest                        18116 non-null  int64         
 13  no_of_previous_cancellations          18116 non-null  int64         
 14  no_of_previous_bookings_not_canceled  18116 non-null  int64         
 15  avg_price_per_room                    18116 non-null  float64       
 16  no_of_special_requests                18116 non-null  int64         
 17  booking_status_x                      18116 non-null  int64         
 18  date                                  18116 non-null  datetime64[ns]
 19  dayofweek                             18116 non-null  int64         
 20  quarter                               18116 non-null  int64         
 21  month_y                               18116 non-null  int64         
 22  year_y                                18116 non-null  int64         
 23  dayofyear                             18116 non-null  int64         
 24  dayofmonth                            18116 non-null  int64         
 25  weekofyear                            18116 non-null  UInt32        
 26  weekday                               18116 non-null  category      
 27  season                                18052 non-null  category      
dtypes: UInt32(1), category(2), datetime64[ns](1), float64(1), int64(23)
memory usage: 3.7 MB
# Model inputs: the features with the strongest (absolute) correlation to the label.
X = data1[['avg_price_per_room','no_of_special_requests','market_segment_type','lead_time','required_car_parking_space']]
y = data1['booking_status_x']
print(X.head(4))
print(y.head(4))
   avg_price_per_room  no_of_special_requests  market_segment_type  lead_time  \
0               110.8                       2                    1        118   
1               107.0                       2                    1         21   
2                91.5                       0                    1         42   
3                92.5                       1                    1         61   

   required_car_parking_space  
0                           0  
1                           0  
2                           1  
3                           0  
0    0
1    0
2    0
3    0
Name: booking_status_x, dtype: int64
# Random forest baseline.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): make_classification is imported but never used here.
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# 70/30 train/test split with a fixed seed for the split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
# NOTE(review): no random_state on the forest itself, so accuracy varies run to run.
rfc = RandomForestClassifier(n_estimators=400)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(accuracy_score(y_test,rfc_pred)*100)
# Keep the accuracy for later comparison across models.
k=accuracy_score(y_test,rfc_pred)*100
print('\n')
print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))
85.22539098436063


[[3313  339]
 [ 464 1319]]


              precision    recall  f1-score   support

           0       0.88      0.91      0.89      3652
           1       0.80      0.74      0.77      1783

    accuracy                           0.85      5435
   macro avg       0.84      0.82      0.83      5435
weighted avg       0.85      0.85      0.85      5435
# KNN baseline (k=1). KNN is distance-based, so features must be standardized
# or large-scale columns (lead_time, price) dominate the metric.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

scale= StandardScaler()
scale.fit(X)
scaled_features = scale.transform(X)
# Bug fix: the scaled matrix was computed but the *unscaled* X was split and
# fitted below; train on the standardized features instead.
df_feat = pd.DataFrame(scaled_features, columns=X.columns, index=X.index)
y = data1['booking_status_x']
# NOTE(review): test_size=0.80 trains on only 20% of the data — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(df_feat,y,test_size=0.80,random_state=101)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
KNN Accuracy score is:  74.27723728696613


[[7752 2017]
 [1711 3013]]


              precision    recall  f1-score   support

           0       0.82      0.79      0.81      9769
           1       0.60      0.64      0.62      4724

    accuracy                           0.74     14493
   macro avg       0.71      0.72      0.71     14493
weighted avg       0.75      0.74      0.74     14493
# Elbow method: misclassification rate for k = 1..19 to pick the best k.
plt.figure(figsize=(14,6))
error = []
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Mean of the boolean mismatch vector = test error rate for this k.
    error.append(np.mean(pred_i != y_test))

sns.set_style('whitegrid')
plt.plot(range(1,20),error,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Text(0, 0.5, 'Error Rate')

png

# Refit with k=2, chosen from the elbow plot above.
knn = KNeighborsClassifier(n_neighbors=2)
# NOTE(review): the fit() return value is immediately overwritten by predict().
pred = knn.fit(X_train,y_train)
pred = knn.predict(X_test)
#print('LogisticRegression score is: ',np.round(model.score(y_test,pred)*100,decimals=2))
print('\n')
print('Best KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
# Keep the accuracy for later comparison across models.
m=accuracy_score(y_test,pred)*100
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
Best KNN Accuracy score is:  77.95487476712896


[[9052  717]
 [2478 2246]]


              precision    recall  f1-score   support

           0       0.79      0.93      0.85      9769
           1       0.76      0.48      0.58      4724

    accuracy                           0.78     14493
   macro avg       0.77      0.70      0.72     14493
weighted avg       0.78      0.78      0.76     14493
# Logistic regression baseline.

from sklearn.linear_model import LogisticRegression

y = data1['booking_status_x']
# 60/40 split, same seed as the other models for comparability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40 ,random_state=101)
# NOTE(review): features are unscaled here; the default solver may need
# scaling or a higher max_iter to converge — confirm no ConvergenceWarning.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print('\n')
print('Accuracy score is: ',accuracy_score(y_test,predictions)*100)
# Keep the accuracy for later comparison across models.
p=accuracy_score(y_test,predictions)*100

print('\n')
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))
Accuracy score is:  78.52904650200084


[[4359  522]
 [1034 1332]]


              precision    recall  f1-score   support

           0       0.81      0.89      0.85      4881
           1       0.72      0.56      0.63      2366

    accuracy                           0.79      7247
   macro avg       0.76      0.73      0.74      7247
weighted avg       0.78      0.79      0.78      7247
# Render the logistic-regression confusion matrix as an annotated image.
sns.set_context("poster", font_scale = .75)
cm = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(6, 5))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Flip the y-axis so row 0 (actual 0s) is drawn at the top.
ax.set_ylim(1.5, -0.5)
for i in range(2):
    for j in range(2):
        # Print each cell count in the center of its square.
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()

png

import plotly.express as px
from plotly import graph_objects
from textwrap import wrap
import chart_studio.plotly as py

# List every named plotly colorscale, wrapped into 96-character rows for display.
named_colorscales = px.colors.named_colorscales()
print("\n".join(wrap("".join('{:<12}'.format(c) for c in named_colorscales), 96)))
aggrnyl     agsunset    blackbody   bluered     blues       blugrn      bluyl       brwnyl
bugn        bupu        burg        burgyl      cividis     darkmint    electric    emrld
gnbu        greens      greys       hot         inferno     jet         magenta     magma
mint        orrd        oranges     oryel       peach       pinkyl      plasma      plotly3
pubu        pubugn      purd        purp        purples     purpor      rainbow     rdbu
rdpu        redor       reds        sunset      sunsetdark  teal        tealgrn     turbo
viridis     ylgn        ylgnbu      ylorbr      ylorrd      algae       amp         deep
dense       gray        haline      ice         matter      solar       speed       tempo
thermal     turbid      armyrose    brbg        earth       fall        geyser      prgn
piyg        picnic      portland    puor        rdgy        rdylbu      rdylgn      spectral
tealrose    temps       tropic      balance     curl        delta       oxy         edge
hsv         icefire     phase       twilight    mrybm       mygbm
!pip install chart-studio
Requirement already satisfied: chart-studio in c:\users\brett\anaconda3\envs\school\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (5.13.0)
Requirement already satisfied: requests in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (2.28.1)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.16.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.3.4)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from plotly->chart-studio) (8.2.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2.0.4)

xgboost

# Baseline XGBoost classifier (10 trees) on the logistic-regression split.
import xgboost as xgb
from sklearn.model_selection import train_test_split

xg_cl = xgb.XGBClassifier(objective = 'binary:logistic', n_estimators = 10, seed = 123)
xg_cl.fit(X_train, y_train)

preds = xg_cl.predict(X_test)
# Mean of the element-wise comparison IS the accuracy — clearer and
# equivalent to the original float(np.sum(...)) / shape[0] arithmetic.
accuracy = np.mean(preds == y_test)
b1 = accuracy * 100  # kept for the final comparison chart
print('accuracy: %f' % (accuracy))
accuracy: 0.844901
import warnings
# NOTE(review): this silences EVERY warning (deprecations, convergence
# messages included); consider filtering specific categories instead.
warnings.filterwarnings('ignore')
# Inspect correlation between label and features — used to pick the
# feature subset for the tuned model further down.


print(data1.corr()["booking_status_x"].sort_values(ascending=False))
booking_status_x                        1.000000
lead_time                               0.434283
year_y                                  0.183568
year_x                                  0.183568
avg_price_per_room                      0.145339
no_of_week_nights                       0.096321
no_of_adults                            0.093965
type_of_meal_plan                       0.076771
no_of_weekend_nights                    0.061341
no_of_children                          0.035009
dayofweek                               0.028634
room_type_reserved                      0.021954
day                                     0.012104
dayofmonth                              0.012104
quarter                                 0.000175
weekofyear                             -0.011193
dayofyear                              -0.011705
month_y                                -0.012305
month_x                                -0.012305
no_of_previous_cancellations           -0.032113
market_segment_type                    -0.045607
no_of_previous_bookings_not_canceled   -0.060390
required_car_parking_space             -0.092620
repeated_guest                         -0.107060
no_of_special_requests                 -0.248649
Name: booking_status_x, dtype: float64
# Keep the eight features; selection guided by the correlations printed above.
feature_cols = [
    'avg_price_per_room',
    'no_of_special_requests',
    'market_segment_type',
    'lead_time',
    'required_car_parking_space',
    'repeated_guest',
    'no_of_week_nights',
    'no_of_adults',
]
X = data1[feature_cols]
y = data1['booking_status_x']
print(X.head(4))
print(y.head(4))
   avg_price_per_room  no_of_special_requests  market_segment_type  lead_time  \
0               110.8                       2                    1        118   
1               107.0                       2                    1         21   
2                91.5                       0                    1         42   
3                92.5                       1                    1         61   

   required_car_parking_space  repeated_guest  no_of_week_nights  no_of_adults  
0                           0               0                  4             2  
1                           0               0                  1             2  
2                           1               0                  1             2  
3                           0               0                  3             2  
0    0
1    0
2    0
3    0
Name: booking_status_x, dtype: int64
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so train and test keep the same cancellation ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 123)

Build Pipeline

!pip install category-encoders
Requirement already satisfied: category-encoders in c:\users\brett\anaconda3\envs\school\lib\site-packages (2.6.0)
Requirement already satisfied: scipy>=1.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.10.0)
Requirement already satisfied: patsy>=0.5.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.5.3)
Requirement already satisfied: pandas>=1.0.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.5.3)
Requirement already satisfied: numpy>=1.14.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.23.5)
Requirement already satisfied: statsmodels>=0.9.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.13.5)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2022.7)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.1->category-encoders) (1.16.0)
Requirement already satisfied: joblib>=1.1.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (3.1.0)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.9.0->category-encoders) (22.0)
conda install -c anaconda py-xgboost
Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.




==> WARNING: A newer version of conda exists. <==
  current version: 23.1.0
  latest version: 23.7.3

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.7.3
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
import xgboost as xgb

# Two-step pipeline: target-encode categorical columns (if any), then XGBoost.
pipe = Pipeline(steps=[
    ('encoder', TargetEncoder()),
    ('clf', xgb.XGBClassifier(random_state=123)),
])
pipe
Pipeline(steps=[('encoder', TargetEncoder()),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               predictor=None, random_state=123, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Set up hyperparameter tuning

!pip install scikit-optimize
Requirement already satisfied: scikit-optimize in c:\users\brett\anaconda3\envs\school\lib\site-packages (0.9.0)
Requirement already satisfied: joblib>=0.11 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.0)
Requirement already satisfied: pyaml>=16.9 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (21.10.1)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.1)
Requirement already satisfied: numpy>=1.13.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.23.5)
Requirement already satisfied: scipy>=0.19.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.10.0)
Requirement already satisfied: PyYAML in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pyaml>=16.9->scikit-optimize) (6.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->scikit-optimize) (3.1.0)
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Bayesian hyperparameter search over the XGBoost step; the 'clf__' prefix
# routes each parameter through the Pipeline to the classifier.
search_space = {
    'clf__max_depth': Integer(2,8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode' : Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}

# 10 search iterations, each scored by 3-fold cross-validated ROC AUC.
opt = BayesSearchCV(pipe, search_space, cv=3, n_iter=10, scoring='roc_auc', random_state=8) 
#train xgboost model
opt.fit(X_train, y_train)
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('encoder', TargetEncoder()),
                                        ('clf',
                                         XGBClassifier(base_score=None,
                                                       booster=None,
                                                       callbacks=None,
                                                       colsample_bylevel=None,
                                                       colsample_bynode=None,
                                                       colsample_bytree=None,
                                                       early_stopping_rounds=None,
                                                       enable_categorical=False,
                                                       eval_metric=None,
                                                       feature_types=None,
                                                       gamma=None, gpu_id=None,
                                                       grow_policy=None,
                                                       importance_type=N...
                             'clf__learning_rate': Real(low=0.001, high=1.0, prior='log-uniform', transform='normalize'),
                             'clf__max_depth': Integer(low=2, high=8, prior='uniform', transform='normalize'),
                             'clf__reg_alpha': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
                             'clf__reg_lambda': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
                             'clf__subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Evaluate the model / make predictions

# Pipeline refit on the full training set with the best-found hyperparameters.
opt.best_estimator_
Pipeline(steps=[('encoder', TargetEncoder(cols=[])),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=0.9425384185492701,
                               colsample_bynode=0.9095956806239844,
                               colsample_bytree=0.706128679361455,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=1.6598135411398998,
                               gpu_id=None, g...icy=None,
                               importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.7929828265552742, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=7,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=123, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Best mean cross-validated ROC AUC found during the search.
opt.best_score_
0.9117809899504422
# ROC AUC of the refit best model on the held-out test split.
opt.score(X_test, y_test)
0.9192097148275309
# NOTE(review): b2 is a CV ROC-AUC (x100), while the other models' scores
# are accuracies — the comparison chart below mixes metrics; confirm intent.
b2 = opt.best_score_ * 100
# Hard class predictions for the test set.
opt.predict(X_test)
array([0, 0, 0, ..., 0, 0, 0])
# Per-class probabilities (one column per class) for the test set.
opt.predict_proba(X_test)
array([[0.9834009 , 0.01659912],
       [0.86485183, 0.13514817],
       [0.8582463 , 0.14175366],
       ...,
       [0.99780715, 0.00219283],
       [0.95073146, 0.04926857],
       [0.55550265, 0.44449738]], dtype=float32)

Feature importance

# Inspect the (name, estimator) steps of the winning pipeline.
opt.best_estimator_.steps
[('encoder', TargetEncoder(cols=[])),
 ('clf',
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=0.9425384185492701,
                colsample_bynode=0.9095956806239844,
                colsample_bytree=0.706128679361455, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=1.6598135411398998, gpu_id=None, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.7929828265552742, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=7, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                predictor=None, random_state=123, ...))]
from xgboost import plot_importance

# Pull the fitted XGBoost estimator out of the winning pipeline
# (step 1 is ('clf', XGBClassifier)) and plot its feature importances.
xgboost_model = opt.best_estimator_.steps[1][1]
plot_importance(xgboost_model)
<AxesSubplot: title={'center': 'Feature importance'}, xlabel='F score', ylabel='Features'>

png

# Cross-validate a short native-XGBoost run: 5 boosting rounds,
# 5 folds, stopping early if the test error stalls for 10 rounds.
n_folds = 5
early_stopping = 10

# Booster parameters: binary classification, scored by error rate.
params = {
    'objective': 'binary:logistic',
    'seed': 99,
    'eval_metric': 'error',
}

# XGBoost's native CV API takes the data wrapped in a DMatrix.
DTrain = xgb.DMatrix(X_train, label=y_train)

# One row per boosting round: mean/std of train and test error across folds.
cv_df = xgb.cv(params, DTrain, num_boost_round=5, nfold=n_folds,
               early_stopping_rounds=early_stopping)

print(cv_df)
   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.163573         0.001738         0.173267        0.006149
1          0.156224         0.002473         0.164780        0.004829
2          0.151998         0.000750         0.158777        0.005875
3          0.149065         0.001235         0.158294        0.005947
4          0.146132         0.001554         0.154637        0.005209
# Longer run: 600 boosting rounds, 10-fold CV with shuffled folds.
cv = xgb.cv(params, DTrain, num_boost_round = 600, nfold=10,
            shuffle = True)
# Print the first five rows of the CV results data frame
print(cv.head())

# Mean test error across all rounds (eval_metric above is 'error', not AUC)
print(np.mean(cv['test-error-mean']).round(2))

# Plot the test error score for each boosting iteration
plt.plot(cv['test-error-mean'])
plt.title('Test Error Score Over 600 Iterations')
plt.xlabel('Iteration Number')
plt.ylabel('Test Error Score')
plt.show()
   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.164689         0.001560         0.170438        0.007783
1          0.156339         0.003456         0.163262        0.006057
2          0.152191         0.001917         0.158846        0.007608
3          0.148910         0.001316         0.156706        0.008268
4          0.146732         0.001538         0.155051        0.006846
0.13

png

# Predict on the stratified test split with the tuned XGBoost model.
b1_preds = xgboost_model.predict(X_test)
b1_preds
array([0, 0, 0, ..., 0, 0, 0])
# Per-class precision/recall/F1 with human-readable class names.
target_names = ['Non-Cancel', 'Cancel']
print(classification_report(y_test, b1_preds, target_names=target_names))
              precision    recall  f1-score   support

  Non-Cancel       0.88      0.92      0.90      2436
      Cancel       0.83      0.75      0.78      1188

    accuracy                           0.87      3624
   macro avg       0.85      0.83      0.84      3624
weighted avg       0.86      0.87      0.86      3624
import plotly.express as px
from textwrap import wrap

# Donut chart comparing the headline score of each model.
# NOTE(review): b2 is a CV ROC-AUC while the other values are accuracies —
# verify these are comparable before reading the chart.
label = ['Random Forest','K Nearest Neighbours','Logistics Regression', 'XGBoost_1', 'XGBoost_2']
scores = [k, m, p, b1, b2]
fig = px.pie(labels=label, values=scores, width=700, names=label, height=700)
fig.update_traces(
    textposition='inside',
    textinfo='percent + label',
    hole=0.50,
    marker=dict(colors=px.colors.sequential.YlGnBu,
                line=dict(color='white', width=2)),
)

fig.update_layout(
    annotations=[dict(text='Performance Comparison',
                      x=0.5, y=0.5, font_size=20, showarrow=False,
                      font_family='monospace',
                      font_color='black')],
    showlegend=False,
)

png