Hotel Cancellation EDA & Predictions

Predicting Hotel Cancellations SK-Learn, Neural Networks & XGBOOST

July 02, 2023 · 83 mins read
# Core analysis stack: pandas/numpy for data wrangling, seaborn/matplotlib for plots.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the hotel-booking training set from the working directory.
data = pd.read_csv('TrainData.csv')
# booking_status is the target (y). NOTE(review): presumably 1 = canceled,
# 0 = not canceled — confirm against the data dictionary.
data.head()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time arrival_year arrival_month arrival_date market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status
0 2 0 1 4 0 0 0 118 2017 12 28 1 0 0 0 110.80 2 0
1 2 1 0 2 0 0 0 17 2018 4 14 1 0 0 0 145.00 0 1
2 1 0 1 5 0 0 0 349 2018 10 4 0 0 0 0 96.67 0 1
3 1 0 2 4 0 0 0 69 2018 6 12 0 0 0 0 120.00 0 1
4 2 0 0 4 1 0 0 11 2018 1 20 1 0 0 0 69.50 1 0
# Summary statistics; every column is already numerically encoded.
data.describe()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time arrival_year arrival_month arrival_date market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status
count 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000 18137.000000
mean 1.846777 0.107515 0.811104 2.208965 0.318465 0.031648 0.336770 85.377405 2017.820698 7.432762 15.660804 0.806197 0.025087 0.022440 0.151403 103.478868 0.617522 0.327618
std 0.516020 0.408901 0.873470 1.426365 0.629140 0.175066 0.772865 86.611736 0.383616 3.076999 8.772788 0.645972 0.156393 0.370078 1.714135 35.474103 0.787941 0.469357
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2017.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 17.000000 2018.000000 5.000000 8.000000 0.000000 0.000000 0.000000 0.000000 80.300000 0.000000 0.000000
50% 2.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 57.000000 2018.000000 8.000000 16.000000 1.000000 0.000000 0.000000 0.000000 99.450000 0.000000 0.000000
75% 2.000000 0.000000 2.000000 3.000000 0.000000 0.000000 0.000000 127.000000 2018.000000 10.000000 23.000000 1.000000 0.000000 0.000000 0.000000 120.270000 1.000000 1.000000
max 4.000000 9.000000 7.000000 17.000000 3.000000 1.000000 6.000000 443.000000 2018.000000 12.000000 31.000000 4.000000 1.000000 13.000000 58.000000 540.000000 5.000000 1.000000
# NOTE(review): seaborn is already imported at the top — this re-import is redundant.
import seaborn as sns
# Correlation heatmap over all numeric columns, rounded to 2 decimals.
plt.figure(figsize = (14,7))
sns.heatmap(data.corr().round(2), annot =True, cmap='YlGnBu')

# Observations: negative correlation between (arrival year & month) and between
# (market segment & lead time); positive correlation between (repeat guest &
# no. of previous bookings not canceled), (market segment & repeat guest),
# and (no. of children & room type).
<AxesSubplot: >

png

# Inspect dtypes and non-null counts; the arrival date is split across three
# integer columns (year/month/date) and is assembled into a datetime later.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18137 entries, 0 to 18136
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          18137 non-null  int64  
 1   no_of_children                        18137 non-null  int64  
 2   no_of_weekend_nights                  18137 non-null  int64  
 3   no_of_week_nights                     18137 non-null  int64  
 4   type_of_meal_plan                     18137 non-null  int64  
 5   required_car_parking_space            18137 non-null  int64  
 6   room_type_reserved                    18137 non-null  int64  
 7   lead_time                             18137 non-null  int64  
 8   arrival_year                          18137 non-null  int64  
 9   arrival_month                         18137 non-null  int64  
 10  arrival_date                          18137 non-null  int64  
 11  market_segment_type                   18137 non-null  int64  
 12  repeated_guest                        18137 non-null  int64  
 13  no_of_previous_cancellations          18137 non-null  int64  
 14  no_of_previous_bookings_not_canceled  18137 non-null  int64  
 15  avg_price_per_room                    18137 non-null  float64
 16  no_of_special_requests                18137 non-null  int64  
 17  booking_status                        18137 non-null  int64  
dtypes: float64(1), int64(17)
memory usage: 2.5 MB
# Fraction of missing values per column (all zeros -> no nulls to impute).
data.isnull().sum()/len(data)
no_of_adults                            0.0
no_of_children                          0.0
no_of_weekend_nights                    0.0
no_of_week_nights                       0.0
type_of_meal_plan                       0.0
required_car_parking_space              0.0
room_type_reserved                      0.0
lead_time                               0.0
arrival_year                            0.0
arrival_month                           0.0
arrival_date                            0.0
market_segment_type                     0.0
repeated_guest                          0.0
no_of_previous_cancellations            0.0
no_of_previous_bookings_not_canceled    0.0
avg_price_per_room                      0.0
no_of_special_requests                  0.0
booking_status                          0.0
dtype: float64
# Check for outliers — only meaningful on continuous variables.
plt.title("Lead_Time", fontdict = {'fontsize': 20})
sns.boxplot(x=data["lead_time"], palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Lead_Time'}, xlabel='lead_time'>

png

# Check for outliers — only meaningful on continuous variables.
plt.title("Avg Price Per Room", fontdict = {'fontsize': 20})
sns.boxplot(x=data["avg_price_per_room"],palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Avg Price Per Room'}, xlabel='avg_price_per_room'>

png

# Joint density (KDE) of lead time vs. average room price.
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data,palette = 'YlGnBu', kind='kde', fill=True)
<seaborn.axisgrid.JointGrid at 0x24a23e46df0>

png

# Same relationship as above, binned into hexagons instead of a KDE.
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data, cmap = 'YlGnBu', kind='hex')
<seaborn.axisgrid.JointGrid at 0x24a23d0ffa0>

png

# Pairplot experiments left disabled (too slow / exploratory only).
# sns.pairplot(data)
# penguins = sns.load_dataset("penguins")
# #sns.pairplot(penguins)
# penguins.info()
# KDE of selected features split by booking_status to eyeball class separability.
plt.figure(figsize = (20, 25))
plt.subplot(5,2,3)
sns.kdeplot(x='lead_time', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,4)
sns.kdeplot(x='arrival_month', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,1)
sns.kdeplot(x='arrival_date', hue='booking_status', palette = 'Set2', fill=True, data=data)

plt.subplot(5,2,2)
sns.kdeplot(x = 'booking_status', hue = 'repeated_guest', palette = 'Set2', fill=True, data = data)
<AxesSubplot: xlabel='booking_status', ylabel='Density'>

png

# Rename the arrival columns so pd.to_datetime can assemble them into one date column.
data = data.rename(columns={'arrival_year': 'year', 'arrival_month': 'month', 'arrival_date': 'day'})
# errors='coerce' turns impossible combinations (e.g. Feb 30) into NaT instead of raising.
data['date'] = pd.to_datetime(data[['year', 'month', 'day']],format='%Y-%m-%d', errors='coerce')
data
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year month day market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status date
0 2 0 1 4 0 0 0 118 2017 12 28 1 0 0 0 110.80 2 0 2017-12-28
1 2 1 0 2 0 0 0 17 2018 4 14 1 0 0 0 145.00 0 1 2018-04-14
2 1 0 1 5 0 0 0 349 2018 10 4 0 0 0 0 96.67 0 1 2018-10-04
3 1 0 2 4 0 0 0 69 2018 6 12 0 0 0 0 120.00 0 1 2018-06-12
4 2 0 0 4 1 0 0 11 2018 1 20 1 0 0 0 69.50 1 0 2018-01-20
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18132 1 0 0 2 0 0 0 103 2018 4 19 0 0 0 0 115.00 0 1 2018-04-19
18133 2 0 0 3 0 0 0 129 2018 8 10 1 0 0 0 88.01 1 0 2018-08-10
18134 2 0 0 1 0 0 0 90 2018 7 13 1 0 0 0 105.30 0 1 2018-07-13
18135 2 0 0 3 0 0 0 18 2018 11 10 1 1 0 1 123.33 1 0 2018-11-10
18136 1 0 1 1 0 0 0 159 2018 4 9 0 0 0 0 65.00 0 0 2018-04-09

18137 rows × 19 columns

# Each row is one reservation; add a constant 1 so a groupby-sum yields daily counts.
data['reservations'] = 1
# Aggregate per arrival date: sums every numeric column, so 'reservations' becomes
# bookings per day and 'booking_status' becomes cancellations per day.
data1 = data.groupby('date').sum()
data1
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year month day market_segment_type repeated_guest no_of_previous_cancellations no_of_previous_bookings_not_canceled avg_price_per_room no_of_special_requests booking_status reservations
date
2017-07-01 61 0 4 76 60 0 0 8001 68578 238 34 10 0 0 0 3134.00 3 25 34
2017-07-02 2 0 0 3 0 0 0 79 2017 7 2 1 0 0 0 76.50 1 1 1
2017-07-03 2 0 1 2 0 0 0 80 2017 7 3 1 0 0 0 76.50 1 1 1
2017-07-04 1 0 1 1 0 0 0 106 2017 7 4 1 0 0 0 68.00 0 1 1
2017-07-05 2 0 2 1 1 0 0 71 2017 7 5 1 0 0 0 55.80 0 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2018-12-27 164 8 38 268 9 0 31 7408 157404 936 2106 33 1 1 8 6705.57 52 10 78
2018-12-28 73 8 19 140 11 1 22 4043 74666 444 1036 39 1 6 58 4205.69 35 10 37
2018-12-29 108 13 66 171 13 0 29 9223 119062 708 1711 50 0 0 0 6100.70 48 14 59
2018-12-30 98 10 44 100 6 1 23 5506 94846 564 1410 35 0 0 0 6257.65 39 10 47
2018-12-31 51 4 39 42 8 1 7 3194 48432 288 744 16 0 0 0 2947.58 10 7 24

548 rows × 19 columns

from pandas.api.types import CategoricalDtype

# Ordered weekday dtype so plots sort Monday..Sunday instead of alphabetically.
cat_type = CategoricalDtype(categories=['Monday', 'Tuesday',
                                       'Wednesday',
                                       'Thursday', 'Friday',
                                       'Saturday', 'Sunday'],
                           ordered=True)
data.index
RangeIndex(start=0, stop=18137, step=1)

def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by date (one row per day). Not modified in place.
    label : str, optional
        Name of the target column; when given, that series is returned too.

    Returns
    -------
    X : pd.DataFrame of calendar features, or (X, y) when `label` is given.
    """
    # Bug fix: the original ignored the `df` argument and always copied the
    # global `data1`, making the function impossible to reuse on other frames.
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so downstream plots sort Monday..Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Map month*100 + day onto an offset so seasons can be cut at fixed
    # boundaries (320 ~ March 20, roughly the spring equinox).
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300

    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                         labels=['Spring', 'Summer', 'Fall', 'Winter'])

    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X

# Calendar features with the daily reservation count as the target.
X, y = create_features(data1, label='reservations')

# Recombine for plotting convenience.
features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season reservations
date
2017-07-01 5 3 7 2017 182 1 26 Saturday Summer 34
2017-07-02 6 3 7 2017 183 2 26 Sunday Summer 1
2017-07-03 0 3 7 2017 184 3 27 Monday Summer 1
2017-07-04 1 3 7 2017 185 4 27 Tuesday Summer 1
2017-07-05 2 3 7 2017 186 5 27 Wednesday Summer 1
2017-07-06 3 3 7 2017 187 6 27 Thursday Summer 6
2017-07-07 4 3 7 2017 188 7 27 Friday Summer 8
2017-07-08 5 3 7 2017 189 8 27 Saturday Summer 3
2017-07-10 0 3 7 2017 191 10 28 Monday Summer 3
2017-07-11 1 3 7 2017 192 11 28 Tuesday Summer 14
2017-07-12 2 3 7 2017 193 12 28 Wednesday Summer 2
2017-07-13 3 3 7 2017 194 13 28 Thursday Summer 6
2017-07-14 4 3 7 2017 195 14 28 Friday Summer 1
2017-07-15 5 3 7 2017 196 15 28 Saturday Summer 4
2017-07-16 6 3 7 2017 197 16 28 Sunday Summer 3
2017-07-17 0 3 7 2017 198 17 29 Monday Summer 29
2017-07-18 1 3 7 2017 199 18 29 Tuesday Summer 11
2017-07-19 2 3 7 2017 200 19 29 Wednesday Summer 2
2017-07-20 3 3 7 2017 201 20 29 Thursday Summer 2
2017-07-21 4 3 7 2017 202 21 29 Friday Summer 1
# Count plots of key categorical variables, restricted to 2018 bookings.
plt.figure(figsize = (20,25))


plt.subplot(3,2,1)
plt.gca().set_title('Bookings By Month')
sns.countplot(x = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])


plt.subplot(3,2,2)
plt.gca().set_title('Variable market_segment_type')
sns.countplot(x = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
# TODO: relabel market segments with their names (Offline, Online, Corporate, Aviation, Complementary).


plt.subplot(3,2,3)
plt.gca().set_title('Variable booking_status')
sns.countplot(x = 'booking_status', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(3,2,4)
plt.gca().set_title('Variable no_of_children')
sns.countplot(x = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: title={'center': 'Variable no_of_children'}, xlabel='no_of_children', ylabel='count'>

png

# Booking status split by arrival month, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Booking Status By Month",fontweight="bold", fontsize=20)

plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>

png

# TODO: relabel booking_status from 0/1 to descriptive names (not canceled / canceled).

# Booking status broken down by each categorical feature, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Analysis Of Variable booking_status",fontweight="bold", fontsize=20)

plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'no_of_adults', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,2)
sns.countplot(x = 'booking_status', hue = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,3)
sns.countplot(x = 'booking_status', hue = 'no_of_weekend_nights',palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,4)
sns.countplot(x = 'booking_status', hue = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,5)
sns.countplot(x = 'booking_status', hue = 'type_of_meal_plan', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,6)
sns.countplot(x = 'booking_status', hue = 'required_car_parking_space', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,7)
sns.countplot(x = 'booking_status', hue = 'room_type_reserved', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])

plt.subplot(5,2,8)
sns.countplot(x = 'booking_status', hue = 'year', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>

png

# Lead time vs. price, colored by cancellation outcome.
sns.scatterplot(data=data, x="lead_time", y="avg_price_per_room", palette = 'YlGnBu', hue = 'booking_status')
<AxesSubplot: xlabel='lead_time', ylabel='avg_price_per_room'>

png

# Daily reservation counts by weekday, split by season.
# dropna() removes days whose season bin is NaN (date_offset == 0 falls outside the cut bins).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
            x='weekday',
            y='reservations',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Reservations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('reservations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png


def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    NOTE(review): this redefines the earlier `create_features`; the two copies
    are identical and should eventually be deduplicated.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by date (one row per day). Not modified in place.
    label : str, optional
        Name of the target column; when given, that series is returned too.

    Returns
    -------
    X : pd.DataFrame of calendar features, or (X, y) when `label` is given.
    """
    # Bug fix: the original ignored the `df` argument and always copied the
    # global `data1`.
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so downstream plots sort Monday..Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Season boundaries cut on month*100 + day, offset so ~March 20 starts Spring.
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300

    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                         labels=['Spring', 'Summer', 'Fall', 'Winter'])

    X = df[['dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X

# Same calendar features, but the target is now the *daily cancellation count*
# (data1 is the groupby-sum frame, so booking_status was summed per day).
X, y = create_features(data1, label='booking_status')

features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season booking_status
date
2017-07-01 5 3 7 2017 182 1 26 Saturday Summer 25
2017-07-02 6 3 7 2017 183 2 26 Sunday Summer 1
2017-07-03 0 3 7 2017 184 3 27 Monday Summer 1
2017-07-04 1 3 7 2017 185 4 27 Tuesday Summer 1
2017-07-05 2 3 7 2017 186 5 27 Wednesday Summer 1
2017-07-06 3 3 7 2017 187 6 27 Thursday Summer 6
2017-07-07 4 3 7 2017 188 7 27 Friday Summer 6
2017-07-08 5 3 7 2017 189 8 27 Saturday Summer 1
2017-07-10 0 3 7 2017 191 10 28 Monday Summer 2
2017-07-11 1 3 7 2017 192 11 28 Tuesday Summer 4
2017-07-12 2 3 7 2017 193 12 28 Wednesday Summer 2
2017-07-13 3 3 7 2017 194 13 28 Thursday Summer 5
2017-07-14 4 3 7 2017 195 14 28 Friday Summer 1
2017-07-15 5 3 7 2017 196 15 28 Saturday Summer 4
2017-07-16 6 3 7 2017 197 16 28 Sunday Summer 3
2017-07-17 0 3 7 2017 198 17 29 Monday Summer 15
2017-07-18 1 3 7 2017 199 18 29 Tuesday Summer 10
2017-07-19 2 3 7 2017 200 19 29 Wednesday Summer 2
2017-07-20 3 3 7 2017 201 20 29 Thursday Summer 1
2017-07-21 4 3 7 2017 202 21 29 Friday Summer 1
# Daily cancellation counts by weekday, split by season (both years).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
            x='weekday',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Cancelations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

# Restrict to 2018 — the only full calendar year in the data (2017 starts in July).
data_2018 = features_and_target.loc[features_and_target['year'] == 2018]
data_2018.head()
dayofweek quarter month year dayofyear dayofmonth weekofyear weekday season booking_status
date
2018-01-01 0 1 1 2018 1 1 1 Monday Winter 0
2018-01-02 1 1 1 2018 2 2 1 Tuesday Winter 0
2018-01-03 2 1 1 2018 3 3 1 Wednesday Winter 0
2018-01-04 3 1 1 2018 4 4 1 Thursday Winter 0
2018-01-05 4 1 1 2018 5 5 1 Friday Winter 0
# Same weekday/season breakdown as above, 2018 only.
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = data_2018.dropna(),
            x='weekday',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

ax.set_title('Number of Cancelations by Day of Week 2018 Only')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

# Weekly view: daily cancellation counts per ISO week of 2018, split by season.
fig, ax = plt.subplots(figsize=(15,5))
sns.boxplot(data = data_2018.dropna(),
            x='weekofyear',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')

# Bug fix: the title and x-label were copy-pasted from the day-of-week plot,
# but this plot's x-axis is week of year.
ax.set_title('Number of Cancelations by Week of Year 2018 Only')
ax.set_xlabel('Week of Year')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()

png

date_data = features_and_target.copy()
# Re-bind data1: it was the daily aggregate, and is now the per-booking frame
# joined with the daily calendar features. Overlapping column names get pandas'
# default _x (booking-level) / _y (daily-aggregate) suffixes.
data1 = data.merge(date_data, on = 'date')
data1.head()
no_of_adults no_of_children no_of_weekend_nights no_of_week_nights type_of_meal_plan required_car_parking_space room_type_reserved lead_time year_x month_x ... dayofweek quarter month_y year_y dayofyear dayofmonth weekofyear weekday season booking_status_y
0 2 0 1 4 0 0 0 118 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
1 2 0 1 1 0 0 0 21 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
2 2 0 1 1 0 1 2 42 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
3 2 0 1 3 1 0 0 61 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2
4 2 2 1 0 1 0 3 35 2017 12 ... 3 4 12 2017 362 28 52 Thursday Winter 2

5 rows × 30 columns

# Transposed summary of the merged frame (easier to read with 30 columns).
data1.describe().T
count mean std min 25% 50% 75% max
no_of_adults 18116.0 1.847262 0.51574 0.0 2.0 2.0 2.0 4.0
no_of_children 18116.0 0.107474 0.408828 0.0 0.0 0.0 0.0 9.0
no_of_weekend_nights 18116.0 0.810775 0.873802 0.0 0.0 1.0 2.0 7.0
no_of_week_nights 18116.0 2.208931 1.426151 0.0 1.0 2.0 3.0 17.0
type_of_meal_plan 18116.0 0.318613 0.629172 0.0 0.0 0.0 0.0 3.0
required_car_parking_space 18116.0 0.031629 0.175016 0.0 0.0 0.0 0.0 1.0
room_type_reserved 18116.0 0.336553 0.772457 0.0 0.0 0.0 0.0 6.0
lead_time 18116.0 85.433429 86.63484 0.0 17.0 57.0 127.0 443.0
year_x 18116.0 2017.82049 0.383789 2017.0 2018.0 2018.0 2018.0 2018.0
month_x 18116.0 7.439059 3.073214 1.0 5.0 8.0 10.0 12.0
day 18116.0 15.645341 8.7661 1.0 8.0 16.0 23.0 31.0
market_segment_type 18116.0 0.805917 0.645484 0.0 0.0 1.0 1.0 4.0
repeated_guest 18116.0 0.024895 0.15581 0.0 0.0 0.0 0.0 1.0
no_of_previous_cancellations 18116.0 0.022411 0.370221 0.0 0.0 0.0 0.0 13.0
no_of_previous_bookings_not_canceled 18116.0 0.150364 1.711651 0.0 0.0 0.0 0.0 58.0
avg_price_per_room 18116.0 103.498054 35.461471 0.0 80.3 99.45 120.285 540.0
no_of_special_requests 18116.0 0.617852 0.788105 0.0 0.0 0.0 1.0 5.0
booking_status_x 18116.0 0.327832 0.469436 0.0 0.0 0.0 1.0 1.0
reservations 18116.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0
dayofweek 18116.0 3.090197 2.062916 0.0 1.0 3.0 5.0 6.0
quarter 18116.0 2.801391 1.033189 1.0 2.0 3.0 4.0 4.0
month_y 18116.0 7.439059 3.073214 1.0 5.0 8.0 10.0 12.0
year_y 18116.0 2017.82049 0.383789 2017.0 2018.0 2018.0 2018.0 2018.0
dayofyear 18116.0 210.667311 93.777748 1.0 135.0 226.0 286.0 365.0
dayofmonth 18116.0 15.645341 8.7661 1.0 8.0 16.0 23.0 31.0
weekofyear 18116.0 30.416483 13.389291 1.0 20.0 33.0 41.0 52.0
booking_status_y 18116.0 17.444138 15.874724 0.0 6.0 14.0 23.0 85.0
# booking_status_x is the per-booking (0/1) label carried through the merge.
data1['booking_status_x']
0        0
1        0
2        0
3        0
4        0
        ..
18111    1
18112    0
18113    1
18114    0
18115    1
Name: booking_status_x, Length: 18116, dtype: int64
# Inspect correlation between the label and each numeric feature.
# numeric_only=True: the frame now contains datetime/categorical columns, and
# relying on the implicit default is deprecated (it raised the FutureWarning
# shown in the original run).
print(data1.corr(numeric_only=True)["booking_status_x"].sort_values(ascending=False))
booking_status_x                        1.000000
lead_time                               0.434283
booking_status_y                        0.355568
year_y                                  0.183568
year_x                                  0.183568
avg_price_per_room                      0.145339
no_of_week_nights                       0.096321
no_of_adults                            0.093965
type_of_meal_plan                       0.076771
no_of_weekend_nights                    0.061341
no_of_children                          0.035009
dayofweek                               0.028634
room_type_reserved                      0.021954
day                                     0.012104
dayofmonth                              0.012104
quarter                                 0.000175
weekofyear                             -0.011193
dayofyear                              -0.011705
month_y                                -0.012305
month_x                                -0.012305
no_of_previous_cancellations           -0.032113
market_segment_type                    -0.045607
no_of_previous_bookings_not_canceled   -0.060390
required_car_parking_space             -0.092620
repeated_guest                         -0.107060
no_of_special_requests                 -0.248649
reservations                                 NaN
Name: booking_status_x, dtype: float64


C:\Users\Brett\AppData\Local\Temp\ipykernel_21676\86188331.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  print(data1.corr()["booking_status_x"].sort_values(ascending=False))
# Drop the constant 'reservations' column and 'booking_status_y' (the daily
# cancellation total), which would leak target information into the features.
data1 = data1.drop(['reservations', 'booking_status_y'], axis = 1)
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18116 entries, 0 to 18115
Data columns (total 28 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   no_of_adults                          18116 non-null  int64         
 1   no_of_children                        18116 non-null  int64         
 2   no_of_weekend_nights                  18116 non-null  int64         
 3   no_of_week_nights                     18116 non-null  int64         
 4   type_of_meal_plan                     18116 non-null  int64         
 5   required_car_parking_space            18116 non-null  int64         
 6   room_type_reserved                    18116 non-null  int64         
 7   lead_time                             18116 non-null  int64         
 8   year_x                                18116 non-null  int64         
 9   month_x                               18116 non-null  int64         
 10  day                                   18116 non-null  int64         
 11  market_segment_type                   18116 non-null  int64         
 12  repeated_guest                        18116 non-null  int64         
 13  no_of_previous_cancellations          18116 non-null  int64         
 14  no_of_previous_bookings_not_canceled  18116 non-null  int64         
 15  avg_price_per_room                    18116 non-null  float64       
 16  no_of_special_requests                18116 non-null  int64         
 17  booking_status_x                      18116 non-null  int64         
 18  date                                  18116 non-null  datetime64[ns]
 19  dayofweek                             18116 non-null  int64         
 20  quarter                               18116 non-null  int64         
 21  month_y                               18116 non-null  int64         
 22  year_y                                18116 non-null  int64         
 23  dayofyear                             18116 non-null  int64         
 24  dayofmonth                            18116 non-null  int64         
 25  weekofyear                            18116 non-null  UInt32        
 26  weekday                               18116 non-null  category      
 27  season                                18052 non-null  category      
dtypes: UInt32(1), category(2), datetime64[ns](1), float64(1), int64(23)
memory usage: 3.7 MB
# Model inputs: the features with the strongest (absolute) correlation to the label.
X = data1[['avg_price_per_room','no_of_special_requests','market_segment_type','lead_time','required_car_parking_space']]
y = data1['booking_status_x']
print(X.head(4))
print(y.head(4))
   avg_price_per_room  no_of_special_requests  market_segment_type  lead_time  \
0               110.8                       2                    1        118   
1               107.0                       2                    1         21   
2                91.5                       0                    1         42   
3                92.5                       1                    1         61   

   required_car_parking_space  
0                           0  
1                           0  
2                           1  
3                           0  
0    0
1    0
2    0
3    0
Name: booking_status_x, dtype: int64
# Random forest baseline.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): make_classification is imported but never used here.
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# 70/30 train/test split with a fixed seed for the split.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
# NOTE(review): no random_state on the forest itself, so accuracy varies run to run.
rfc = RandomForestClassifier(n_estimators=400)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(accuracy_score(y_test,rfc_pred)*100)
# Keep the accuracy for later comparison across models.
k=accuracy_score(y_test,rfc_pred)*100
print('\n')
print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))
85.22539098436063


[[3313  339]
 [ 464 1319]]


              precision    recall  f1-score   support

           0       0.88      0.91      0.89      3652
           1       0.80      0.74      0.77      1783

    accuracy                           0.85      5435
   macro avg       0.84      0.82      0.83      5435
weighted avg       0.85      0.85      0.85      5435
# KNN baseline (k=1). KNN is distance-based, so features must be standardized
# or large-scale columns (lead_time, price) dominate the metric.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

scale= StandardScaler()
scale.fit(X)
scaled_features = scale.transform(X)
# Bug fix: the scaled matrix was computed but the *unscaled* X was split and
# fitted below; train on the standardized features instead.
df_feat = pd.DataFrame(scaled_features, columns=X.columns, index=X.index)
y = data1['booking_status_x']
# NOTE(review): test_size=0.80 trains on only 20% of the data — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(df_feat,y,test_size=0.80,random_state=101)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
KNN Accuracy score is:  74.27723728696613


[[7752 2017]
 [1711 3013]]


              precision    recall  f1-score   support

           0       0.82      0.79      0.81      9769
           1       0.60      0.64      0.62      4724

    accuracy                           0.74     14493
   macro avg       0.71      0.72      0.71     14493
weighted avg       0.75      0.74      0.74     14493
# Elbow method: misclassification rate for k = 1..19 to pick the best k.
plt.figure(figsize=(14,6))
error = []
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Mean of the boolean mismatch vector = test error rate for this k.
    error.append(np.mean(pred_i != y_test))

sns.set_style('whitegrid')
plt.plot(range(1,20),error,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Text(0, 0.5, 'Error Rate')

png

# Refit with k=2, chosen from the elbow plot above.
knn = KNeighborsClassifier(n_neighbors=2)
# NOTE(review): the fit() return value is immediately overwritten by predict().
pred = knn.fit(X_train,y_train)
pred = knn.predict(X_test)
#print('LogisticRegression score is: ',np.round(model.score(y_test,pred)*100,decimals=2))
print('\n')
print('Best KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
# Keep the accuracy for later comparison across models.
m=accuracy_score(y_test,pred)*100
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
Best KNN Accuracy score is:  77.95487476712896


[[9052  717]
 [2478 2246]]


              precision    recall  f1-score   support

           0       0.79      0.93      0.85      9769
           1       0.76      0.48      0.58      4724

    accuracy                           0.78     14493
   macro avg       0.77      0.70      0.72     14493
weighted avg       0.78      0.78      0.76     14493
# Logistic regression baseline.

from sklearn.linear_model import LogisticRegression

y = data1['booking_status_x']
# 60/40 split, same seed as the other models for comparability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40 ,random_state=101)
# NOTE(review): features are unscaled here; the default solver may need
# scaling or a higher max_iter to converge — confirm no ConvergenceWarning.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print('\n')
print('Accuracy score is: ',accuracy_score(y_test,predictions)*100)
# Keep the accuracy for later comparison across models.
p=accuracy_score(y_test,predictions)*100

print('\n')
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))
Accuracy score is:  78.52904650200084


[[4359  522]
 [1034 1332]]


              precision    recall  f1-score   support

           0       0.81      0.89      0.85      4881
           1       0.72      0.56      0.63      2366

    accuracy                           0.79      7247
   macro avg       0.76      0.73      0.74      7247
weighted avg       0.78      0.79      0.78      7247
# Render the logistic-regression confusion matrix as an annotated image.
sns.set_context("poster", font_scale = .75)
cm = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(6, 5))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Flip the y-axis so row 0 (actual 0s) is drawn at the top.
ax.set_ylim(1.5, -0.5)
for i in range(2):
    for j in range(2):
        # Print each cell count in the center of its square.
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()

png

import plotly.express as px
from plotly import graph_objects
from textwrap import wrap
import chart_studio.plotly as py

# List every named plotly colorscale, wrapped into 96-character rows for display.
named_colorscales = px.colors.named_colorscales()
print("\n".join(wrap("".join('{:<12}'.format(c) for c in named_colorscales), 96)))
aggrnyl     agsunset    blackbody   bluered     blues       blugrn      bluyl       brwnyl
bugn        bupu        burg        burgyl      cividis     darkmint    electric    emrld
gnbu        greens      greys       hot         inferno     jet         magenta     magma
mint        orrd        oranges     oryel       peach       pinkyl      plasma      plotly3
pubu        pubugn      purd        purp        purples     purpor      rainbow     rdbu
rdpu        redor       reds        sunset      sunsetdark  teal        tealgrn     turbo
viridis     ylgn        ylgnbu      ylorbr      ylorrd      algae       amp         deep
dense       gray        haline      ice         matter      solar       speed       tempo
thermal     turbid      armyrose    brbg        earth       fall        geyser      prgn
piyg        picnic      portland    puor        rdgy        rdylbu      rdylgn      spectral
tealrose    temps       tropic      balance     curl        delta       oxy         edge
hsv         icefire     phase       twilight    mrybm       mygbm
!pip install chart-studio
Requirement already satisfied: chart-studio in c:\users\brett\anaconda3\envs\school\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (5.13.0)
Requirement already satisfied: requests in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (2.28.1)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.16.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.3.4)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from plotly->chart-studio) (8.2.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2.0.4)

xgboost

# Baseline XGBoost classifier (10 trees) on the logistic-regression split.
import xgboost as xgb
from sklearn.model_selection import train_test_split

xg_cl = xgb.XGBClassifier(objective = 'binary:logistic', n_estimators = 10, seed = 123)
xg_cl.fit(X_train, y_train)

preds = xg_cl.predict(X_test)
# Mean of the element-wise comparison IS the accuracy — clearer and
# equivalent to the original float(np.sum(...)) / shape[0] arithmetic.
accuracy = np.mean(preds == y_test)
b1 = accuracy * 100  # kept for the final comparison chart
print('accuracy: %f' % (accuracy))
accuracy: 0.844901
import warnings
# NOTE(review): this silences EVERY warning (deprecations, convergence
# messages included); consider filtering specific categories instead.
warnings.filterwarnings('ignore')
# Inspect correlation between label and features — used to pick the
# feature subset for the tuned model further down.


print(data1.corr()["booking_status_x"].sort_values(ascending=False))
booking_status_x                        1.000000
lead_time                               0.434283
year_y                                  0.183568
year_x                                  0.183568
avg_price_per_room                      0.145339
no_of_week_nights                       0.096321
no_of_adults                            0.093965
type_of_meal_plan                       0.076771
no_of_weekend_nights                    0.061341
no_of_children                          0.035009
dayofweek                               0.028634
room_type_reserved                      0.021954
day                                     0.012104
dayofmonth                              0.012104
quarter                                 0.000175
weekofyear                             -0.011193
dayofyear                              -0.011705
month_y                                -0.012305
month_x                                -0.012305
no_of_previous_cancellations           -0.032113
market_segment_type                    -0.045607
no_of_previous_bookings_not_canceled   -0.060390
required_car_parking_space             -0.092620
repeated_guest                         -0.107060
no_of_special_requests                 -0.248649
Name: booking_status_x, dtype: float64
# Keep the eight features; selection guided by the correlations printed above.
feature_cols = [
    'avg_price_per_room',
    'no_of_special_requests',
    'market_segment_type',
    'lead_time',
    'required_car_parking_space',
    'repeated_guest',
    'no_of_week_nights',
    'no_of_adults',
]
X = data1[feature_cols]
y = data1['booking_status_x']
print(X.head(4))
print(y.head(4))
   avg_price_per_room  no_of_special_requests  market_segment_type  lead_time  \
0               110.8                       2                    1        118   
1               107.0                       2                    1         21   
2                91.5                       0                    1         42   
3                92.5                       1                    1         61   

   required_car_parking_space  repeated_guest  no_of_week_nights  no_of_adults  
0                           0               0                  4             2  
1                           0               0                  1             2  
2                           1               0                  1             2  
3                           0               0                  3             2  
0    0
1    0
2    0
3    0
Name: booking_status_x, dtype: int64
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so train and test keep the same cancellation ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 123)

Build Pipeline

!pip install category-encoders
Requirement already satisfied: category-encoders in c:\users\brett\anaconda3\envs\school\lib\site-packages (2.6.0)
Requirement already satisfied: scipy>=1.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.10.0)
Requirement already satisfied: patsy>=0.5.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.5.3)
Requirement already satisfied: pandas>=1.0.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.5.3)
Requirement already satisfied: numpy>=1.14.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.23.5)
Requirement already satisfied: statsmodels>=0.9.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.13.5)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2022.7)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.1->category-encoders) (1.16.0)
Requirement already satisfied: joblib>=1.1.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (3.1.0)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.9.0->category-encoders) (22.0)
conda install -c anaconda py-xgboost
Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.




==> WARNING: A newer version of conda exists. <==
  current version: 23.1.0
  latest version: 23.7.3

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.7.3
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
import xgboost as xgb

# Two-step pipeline: target-encode categorical columns (if any), then XGBoost.
pipe = Pipeline(steps=[
    ('encoder', TargetEncoder()),
    ('clf', xgb.XGBClassifier(random_state=123)),
])
pipe
Pipeline(steps=[('encoder', TargetEncoder()),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               predictor=None, random_state=123, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Set up hyperparameter tuning

!pip install scikit-optimize
Requirement already satisfied: scikit-optimize in c:\users\brett\anaconda3\envs\school\lib\site-packages (0.9.0)
Requirement already satisfied: joblib>=0.11 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.0)
Requirement already satisfied: pyaml>=16.9 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (21.10.1)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.1)
Requirement already satisfied: numpy>=1.13.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.23.5)
Requirement already satisfied: scipy>=0.19.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.10.0)
Requirement already satisfied: PyYAML in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pyaml>=16.9->scikit-optimize) (6.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->scikit-optimize) (3.1.0)
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Bayesian hyperparameter search over the XGBoost step; the 'clf__' prefix
# routes each parameter through the Pipeline to the classifier.
search_space = {
    'clf__max_depth': Integer(2,8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode' : Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0)
}

# 10 search iterations, each scored by 3-fold cross-validated ROC AUC.
opt = BayesSearchCV(pipe, search_space, cv=3, n_iter=10, scoring='roc_auc', random_state=8) 
#train xgboost model
opt.fit(X_train, y_train)
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
BayesSearchCV(cv=3,
              estimator=Pipeline(steps=[('encoder', TargetEncoder()),
                                        ('clf',
                                         XGBClassifier(base_score=None,
                                                       booster=None,
                                                       callbacks=None,
                                                       colsample_bylevel=None,
                                                       colsample_bynode=None,
                                                       colsample_bytree=None,
                                                       early_stopping_rounds=None,
                                                       enable_categorical=False,
                                                       eval_metric=None,
                                                       feature_types=None,
                                                       gamma=None, gpu_id=None,
                                                       grow_policy=None,
                                                       importance_type=N...
                             'clf__learning_rate': Real(low=0.001, high=1.0, prior='log-uniform', transform='normalize'),
                             'clf__max_depth': Integer(low=2, high=8, prior='uniform', transform='normalize'),
                             'clf__reg_alpha': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
                             'clf__reg_lambda': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'),
                             'clf__subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Evaluate the model / make predictions

# Pipeline refit on the full training set with the best-found hyperparameters.
opt.best_estimator_
Pipeline(steps=[('encoder', TargetEncoder(cols=[])),
                ('clf',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=0.9425384185492701,
                               colsample_bynode=0.9095956806239844,
                               colsample_bytree=0.706128679361455,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=1.6598135411398998,
                               gpu_id=None, g...icy=None,
                               importance_type=None,
                               interaction_constraints=None,
                               learning_rate=0.7929828265552742, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=7,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=123, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Best mean cross-validated ROC AUC found during the search.
opt.best_score_
0.9117809899504422
# ROC AUC of the refit best model on the held-out test split.
opt.score(X_test, y_test)
0.9192097148275309
# NOTE(review): b2 is a CV ROC-AUC (x100), while the other models' scores
# are accuracies — the comparison chart below mixes metrics; confirm intent.
b2 = opt.best_score_ * 100
# Hard class predictions for the test set.
opt.predict(X_test)
array([0, 0, 0, ..., 0, 0, 0])
# Per-class probabilities (one column per class) for the test set.
opt.predict_proba(X_test)
array([[0.9834009 , 0.01659912],
       [0.86485183, 0.13514817],
       [0.8582463 , 0.14175366],
       ...,
       [0.99780715, 0.00219283],
       [0.95073146, 0.04926857],
       [0.55550265, 0.44449738]], dtype=float32)

Feature importance

# Inspect the (name, estimator) steps of the winning pipeline.
opt.best_estimator_.steps
[('encoder', TargetEncoder(cols=[])),
 ('clf',
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=0.9425384185492701,
                colsample_bynode=0.9095956806239844,
                colsample_bytree=0.706128679361455, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=1.6598135411398998, gpu_id=None, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=0.7929828265552742, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=7, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                predictor=None, random_state=123, ...))]
from xgboost import plot_importance

# Pull the fitted XGBoost estimator out of the winning pipeline
# (step 1 is ('clf', XGBClassifier)) and plot its feature importances.
xgboost_model = opt.best_estimator_.steps[1][1]
plot_importance(xgboost_model)
<AxesSubplot: title={'center': 'Feature importance'}, xlabel='F score', ylabel='Features'>

png

# Cross-validate a short native-XGBoost run: 5 boosting rounds,
# 5 folds, stopping early if the test error stalls for 10 rounds.
n_folds = 5
early_stopping = 10

# Booster parameters: binary classification, scored by error rate.
params = {
    'objective': 'binary:logistic',
    'seed': 99,
    'eval_metric': 'error',
}

# XGBoost's native CV API takes the data wrapped in a DMatrix.
DTrain = xgb.DMatrix(X_train, label=y_train)

# One row per boosting round: mean/std of train and test error across folds.
cv_df = xgb.cv(params, DTrain, num_boost_round=5, nfold=n_folds,
               early_stopping_rounds=early_stopping)

print(cv_df)
   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.163573         0.001738         0.173267        0.006149
1          0.156224         0.002473         0.164780        0.004829
2          0.151998         0.000750         0.158777        0.005875
3          0.149065         0.001235         0.158294        0.005947
4          0.146132         0.001554         0.154637        0.005209
# Longer run: 600 boosting rounds, 10-fold CV with shuffled folds.
cv = xgb.cv(params, DTrain, num_boost_round = 600, nfold=10,
            shuffle = True)
# Print the first five rows of the CV results data frame
print(cv.head())

# Mean test error across all rounds (eval_metric above is 'error', not AUC)
print(np.mean(cv['test-error-mean']).round(2))

# Plot the test error score for each boosting iteration
plt.plot(cv['test-error-mean'])
plt.title('Test Error Score Over 600 Iterations')
plt.xlabel('Iteration Number')
plt.ylabel('Test Error Score')
plt.show()
   train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.164689         0.001560         0.170438        0.007783
1          0.156339         0.003456         0.163262        0.006057
2          0.152191         0.001917         0.158846        0.007608
3          0.148910         0.001316         0.156706        0.008268
4          0.146732         0.001538         0.155051        0.006846
0.13

png

# Predict on the stratified test split with the tuned XGBoost model.
b1_preds = xgboost_model.predict(X_test)
b1_preds
array([0, 0, 0, ..., 0, 0, 0])
# Per-class precision/recall/F1 with human-readable class names.
target_names = ['Non-Cancel', 'Cancel']
print(classification_report(y_test, b1_preds, target_names=target_names))
              precision    recall  f1-score   support

  Non-Cancel       0.88      0.92      0.90      2436
      Cancel       0.83      0.75      0.78      1188

    accuracy                           0.87      3624
   macro avg       0.85      0.83      0.84      3624
weighted avg       0.86      0.87      0.86      3624
import plotly.express as px
from textwrap import wrap

# Donut chart comparing the headline score of each model.
# NOTE(review): b2 is a CV ROC-AUC while the other values are accuracies —
# verify these are comparable before reading the chart.
label = ['Random Forest','K Nearest Neighbours','Logistics Regression', 'XGBoost_1', 'XGBoost_2']
scores = [k, m, p, b1, b2]
fig = px.pie(labels=label, values=scores, width=700, names=label, height=700)
fig.update_traces(
    textposition='inside',
    textinfo='percent + label',
    hole=0.50,
    marker=dict(colors=px.colors.sequential.YlGnBu,
                line=dict(color='white', width=2)),
)

fig.update_layout(
    annotations=[dict(text='Performance Comparison',
                      x=0.5, y=0.5, font_size=20, showarrow=False,
                      font_family='monospace',
                      font_color='black')],
    showlegend=False,
)

png