import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the hotel-booking training data.
data = pd.read_csv('TrainData.csv')
# booking_status is the target label (y), encoded 0/1 — presumably 1 = canceled; TODO confirm against the data dictionary.
data.head()
no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | arrival_year | arrival_month | arrival_date | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0 | 1 | 4 | 0 | 0 | 0 | 118 | 2017 | 12 | 28 | 1 | 0 | 0 | 0 | 110.80 | 2 | 0 |
1 | 2 | 1 | 0 | 2 | 0 | 0 | 0 | 17 | 2018 | 4 | 14 | 1 | 0 | 0 | 0 | 145.00 | 0 | 1 |
2 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 349 | 2018 | 10 | 4 | 0 | 0 | 0 | 0 | 96.67 | 0 | 1 |
3 | 1 | 0 | 2 | 4 | 0 | 0 | 0 | 69 | 2018 | 6 | 12 | 0 | 0 | 0 | 0 | 120.00 | 0 | 1 |
4 | 2 | 0 | 0 | 4 | 1 | 0 | 0 | 11 | 2018 | 1 | 20 | 1 | 0 | 0 | 0 | 69.50 | 1 | 0 |
data.describe()
no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | arrival_year | arrival_month | arrival_date | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 | 18137.000000 |
mean | 1.846777 | 0.107515 | 0.811104 | 2.208965 | 0.318465 | 0.031648 | 0.336770 | 85.377405 | 2017.820698 | 7.432762 | 15.660804 | 0.806197 | 0.025087 | 0.022440 | 0.151403 | 103.478868 | 0.617522 | 0.327618 |
std | 0.516020 | 0.408901 | 0.873470 | 1.426365 | 0.629140 | 0.175066 | 0.772865 | 86.611736 | 0.383616 | 3.076999 | 8.772788 | 0.645972 | 0.156393 | 0.370078 | 1.714135 | 35.474103 | 0.787941 | 0.469357 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2017.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 2.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | 2018.000000 | 5.000000 | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 80.300000 | 0.000000 | 0.000000 |
50% | 2.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 57.000000 | 2018.000000 | 8.000000 | 16.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 99.450000 | 0.000000 | 0.000000 |
75% | 2.000000 | 0.000000 | 2.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 127.000000 | 2018.000000 | 10.000000 | 23.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 120.270000 | 1.000000 | 1.000000 |
max | 4.000000 | 9.000000 | 7.000000 | 17.000000 | 3.000000 | 1.000000 | 6.000000 | 443.000000 | 2018.000000 | 12.000000 | 31.000000 | 4.000000 | 1.000000 | 13.000000 | 58.000000 | 540.000000 | 5.000000 | 1.000000 |
# Correlation heatmap over all (numeric, label-encoded) columns.
# FIX: dropped the redundant `import seaborn as sns` — seaborn is already
# imported at the top of the file.
plt.figure(figsize = (14,7))
sns.heatmap(data.corr().round(2), annot =True, cmap='YlGnBu')
# Observations: negative corr between arrival year & month, and between
# market_segment_type & lead_time; positive corr for repeated_guest &
# no_of_previous_bookings_not_canceled, market_segment & repeated_guest,
# and no_of_children & room_type_reserved.
<AxesSubplot: >
# Check dtypes and non-null counts (null check proper is below).
# Reminder: year/month/day still need converting to a real datetime later.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18137 entries, 0 to 18136
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 no_of_adults 18137 non-null int64
1 no_of_children 18137 non-null int64
2 no_of_weekend_nights 18137 non-null int64
3 no_of_week_nights 18137 non-null int64
4 type_of_meal_plan 18137 non-null int64
5 required_car_parking_space 18137 non-null int64
6 room_type_reserved 18137 non-null int64
7 lead_time 18137 non-null int64
8 arrival_year 18137 non-null int64
9 arrival_month 18137 non-null int64
10 arrival_date 18137 non-null int64
11 market_segment_type 18137 non-null int64
12 repeated_guest 18137 non-null int64
13 no_of_previous_cancellations 18137 non-null int64
14 no_of_previous_bookings_not_canceled 18137 non-null int64
15 avg_price_per_room 18137 non-null float64
16 no_of_special_requests 18137 non-null int64
17 booking_status 18137 non-null int64
dtypes: float64(1), int64(17)
memory usage: 2.5 MB
# Fraction of missing values per column — all zeros, so no imputation needed.
data.isnull().sum()/len(data)
no_of_adults 0.0
no_of_children 0.0
no_of_weekend_nights 0.0
no_of_week_nights 0.0
type_of_meal_plan 0.0
required_car_parking_space 0.0
room_type_reserved 0.0
lead_time 0.0
arrival_year 0.0
arrival_month 0.0
arrival_date 0.0
market_segment_type 0.0
repeated_guest 0.0
no_of_previous_cancellations 0.0
no_of_previous_bookings_not_canceled 0.0
avg_price_per_room 0.0
no_of_special_requests 0.0
booking_status 0.0
dtype: float64
# Check for outliers, but only on the continuous variables.
plt.title("Lead_Time", fontdict = {'fontsize': 20})
# NOTE(review): `palette` without `hue` is deprecated in newer seaborn — confirm version.
sns.boxplot(x=data["lead_time"], palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Lead_Time'}, xlabel='lead_time'>
# Outlier check on the second continuous variable, average price per room.
plt.title("Avg Price Per Room", fontdict = {'fontsize': 20})
# NOTE(review): `palette` without `hue` is deprecated in newer seaborn — confirm version.
sns.boxplot(x=data["avg_price_per_room"],palette = 'YlGnBu')
<AxesSubplot: title={'center': 'Avg Price Per Room'}, xlabel='avg_price_per_room'>
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data,palette = 'YlGnBu', kind='kde', fill=True)
<seaborn.axisgrid.JointGrid at 0x24a23e46df0>
sns.jointplot(x='lead_time', y='avg_price_per_room', data = data, cmap = 'YlGnBu', kind='hex')
<seaborn.axisgrid.JointGrid at 0x24a23d0ffa0>
# (Commented-out pairplot / penguins experiments kept for reference.)
# sns.pairplot(data)
# penguins = sns.load_dataset("penguins")
# #sns.pairplot(penguins)
# penguins.info()
# KDE distributions of selected features split by booking_status.
plt.figure(figsize = (20, 25))
plt.subplot(5,2,3)
sns.kdeplot(x='lead_time', hue='booking_status', palette = 'Set2', fill=True, data=data)
plt.subplot(5,2,4)
sns.kdeplot(x='arrival_month', hue='booking_status', palette = 'Set2', fill=True, data=data)
plt.subplot(5,2,1)
sns.kdeplot(x='arrival_date', hue='booking_status', palette = 'Set2', fill=True, data=data)
plt.subplot(5,2,2)
sns.kdeplot(x = 'booking_status', hue = 'repeated_guest', palette = 'Set2', fill=True, data = data)
<AxesSubplot: xlabel='booking_status', ylabel='Density'>
# Rename the arrival_* columns so pd.to_datetime can assemble a datetime
# from year/month/day component columns.
data = data.rename(columns={'arrival_year': 'year', 'arrival_month': 'month', 'arrival_date': 'day'})
# FIX: removed the `format='%Y-%m-%d'` argument — `format` is not used when
# assembling a datetime from a DataFrame of components, so it was dead and
# misleading. errors='coerce' still maps impossible dates (e.g. Feb 29 in a
# non-leap year) to NaT.
data['date'] = pd.to_datetime(data[['year', 'month', 'day']], errors='coerce')
data
no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | year | month | day | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0 | 1 | 4 | 0 | 0 | 0 | 118 | 2017 | 12 | 28 | 1 | 0 | 0 | 0 | 110.80 | 2 | 0 | 2017-12-28 |
1 | 2 | 1 | 0 | 2 | 0 | 0 | 0 | 17 | 2018 | 4 | 14 | 1 | 0 | 0 | 0 | 145.00 | 0 | 1 | 2018-04-14 |
2 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 349 | 2018 | 10 | 4 | 0 | 0 | 0 | 0 | 96.67 | 0 | 1 | 2018-10-04 |
3 | 1 | 0 | 2 | 4 | 0 | 0 | 0 | 69 | 2018 | 6 | 12 | 0 | 0 | 0 | 0 | 120.00 | 0 | 1 | 2018-06-12 |
4 | 2 | 0 | 0 | 4 | 1 | 0 | 0 | 11 | 2018 | 1 | 20 | 1 | 0 | 0 | 0 | 69.50 | 1 | 0 | 2018-01-20 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18132 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 103 | 2018 | 4 | 19 | 0 | 0 | 0 | 0 | 115.00 | 0 | 1 | 2018-04-19 |
18133 | 2 | 0 | 0 | 3 | 0 | 0 | 0 | 129 | 2018 | 8 | 10 | 1 | 0 | 0 | 0 | 88.01 | 1 | 0 | 2018-08-10 |
18134 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 90 | 2018 | 7 | 13 | 1 | 0 | 0 | 0 | 105.30 | 0 | 1 | 2018-07-13 |
18135 | 2 | 0 | 0 | 3 | 0 | 0 | 0 | 18 | 2018 | 11 | 10 | 1 | 1 | 0 | 1 | 123.33 | 1 | 0 | 2018-11-10 |
18136 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 159 | 2018 | 4 | 9 | 0 | 0 | 0 | 0 | 65.00 | 0 | 0 | 2018-04-09 |
18137 rows × 19 columns
# Add a constant-1 column so summing it per date counts reservations per day.
data['reservations'] = 1
# NOTE(review): .sum() also sums year/month/day/booking_status etc., which is
# meaningless for those columns (see the year totals in the output below);
# only the 'reservations' and 'booking_status' daily totals are used later.
data1 = data.groupby('date').sum()
data1
no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | year | month | day | market_segment_type | repeated_guest | no_of_previous_cancellations | no_of_previous_bookings_not_canceled | avg_price_per_room | no_of_special_requests | booking_status | reservations | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||
2017-07-01 | 61 | 0 | 4 | 76 | 60 | 0 | 0 | 8001 | 68578 | 238 | 34 | 10 | 0 | 0 | 0 | 3134.00 | 3 | 25 | 34 |
2017-07-02 | 2 | 0 | 0 | 3 | 0 | 0 | 0 | 79 | 2017 | 7 | 2 | 1 | 0 | 0 | 0 | 76.50 | 1 | 1 | 1 |
2017-07-03 | 2 | 0 | 1 | 2 | 0 | 0 | 0 | 80 | 2017 | 7 | 3 | 1 | 0 | 0 | 0 | 76.50 | 1 | 1 | 1 |
2017-07-04 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 106 | 2017 | 7 | 4 | 1 | 0 | 0 | 0 | 68.00 | 0 | 1 | 1 |
2017-07-05 | 2 | 0 | 2 | 1 | 1 | 0 | 0 | 71 | 2017 | 7 | 5 | 1 | 0 | 0 | 0 | 55.80 | 0 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2018-12-27 | 164 | 8 | 38 | 268 | 9 | 0 | 31 | 7408 | 157404 | 936 | 2106 | 33 | 1 | 1 | 8 | 6705.57 | 52 | 10 | 78 |
2018-12-28 | 73 | 8 | 19 | 140 | 11 | 1 | 22 | 4043 | 74666 | 444 | 1036 | 39 | 1 | 6 | 58 | 4205.69 | 35 | 10 | 37 |
2018-12-29 | 108 | 13 | 66 | 171 | 13 | 0 | 29 | 9223 | 119062 | 708 | 1711 | 50 | 0 | 0 | 0 | 6100.70 | 48 | 14 | 59 |
2018-12-30 | 98 | 10 | 44 | 100 | 6 | 1 | 23 | 5506 | 94846 | 564 | 1410 | 35 | 0 | 0 | 0 | 6257.65 | 39 | 10 | 47 |
2018-12-31 | 51 | 4 | 39 | 42 | 8 | 1 | 7 | 3194 | 48432 | 288 | 744 | 16 | 0 | 0 | 0 | 2947.58 | 10 | 7 | 24 |
548 rows × 19 columns
from pandas.api.types import CategoricalDtype
# Ordered weekday categorical so weekday plots sort Monday -> Sunday
# instead of alphabetically.
cat_type = CategoricalDtype(categories=['Monday', 'Tuesday',
'Wednesday',
'Thursday', 'Friday',
'Saturday', 'Sunday'],
ordered=True)
# Sanity check: row-level frame still has its default RangeIndex.
data.index
RangeIndex(start=0, stop=18137, step=1)
def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by date. (BUG FIX: the original body ignored this
        argument and always copied the module-level `data1`, so the function
        silently could not be reused on any other frame.)
    label : str, optional
        Name of a column of ``df`` to return as the target ``y``.

    Returns
    -------
    X : pandas.DataFrame of calendar features, or the tuple ``(X, y)``
        when ``label`` is given.
    """
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so weekday plots sort Monday -> Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Season bucketing: offset the (month, day) code so the "year" starts
    # around Mar 20, then cut into four ~season-sized bins.
    # NOTE: an offset of exactly 0 (Mar 20) falls outside every bin and
    # yields a NaN season — same as the original behavior.
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300
    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                          labels=['Spring', 'Summer', 'Fall', 'Winter'])
    X = df[['dayofweek','quarter','month','year',
            'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X
# Build calendar features with per-day reservation counts as the target.
X, y = create_features(data1, label='reservations')
features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | weekday | season | reservations | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2017-07-01 | 5 | 3 | 7 | 2017 | 182 | 1 | 26 | Saturday | Summer | 34 |
2017-07-02 | 6 | 3 | 7 | 2017 | 183 | 2 | 26 | Sunday | Summer | 1 |
2017-07-03 | 0 | 3 | 7 | 2017 | 184 | 3 | 27 | Monday | Summer | 1 |
2017-07-04 | 1 | 3 | 7 | 2017 | 185 | 4 | 27 | Tuesday | Summer | 1 |
2017-07-05 | 2 | 3 | 7 | 2017 | 186 | 5 | 27 | Wednesday | Summer | 1 |
2017-07-06 | 3 | 3 | 7 | 2017 | 187 | 6 | 27 | Thursday | Summer | 6 |
2017-07-07 | 4 | 3 | 7 | 2017 | 188 | 7 | 27 | Friday | Summer | 8 |
2017-07-08 | 5 | 3 | 7 | 2017 | 189 | 8 | 27 | Saturday | Summer | 3 |
2017-07-10 | 0 | 3 | 7 | 2017 | 191 | 10 | 28 | Monday | Summer | 3 |
2017-07-11 | 1 | 3 | 7 | 2017 | 192 | 11 | 28 | Tuesday | Summer | 14 |
2017-07-12 | 2 | 3 | 7 | 2017 | 193 | 12 | 28 | Wednesday | Summer | 2 |
2017-07-13 | 3 | 3 | 7 | 2017 | 194 | 13 | 28 | Thursday | Summer | 6 |
2017-07-14 | 4 | 3 | 7 | 2017 | 195 | 14 | 28 | Friday | Summer | 1 |
2017-07-15 | 5 | 3 | 7 | 2017 | 196 | 15 | 28 | Saturday | Summer | 4 |
2017-07-16 | 6 | 3 | 7 | 2017 | 197 | 16 | 28 | Sunday | Summer | 3 |
2017-07-17 | 0 | 3 | 7 | 2017 | 198 | 17 | 29 | Monday | Summer | 29 |
2017-07-18 | 1 | 3 | 7 | 2017 | 199 | 18 | 29 | Tuesday | Summer | 11 |
2017-07-19 | 2 | 3 | 7 | 2017 | 200 | 19 | 29 | Wednesday | Summer | 2 |
2017-07-20 | 3 | 3 | 7 | 2017 | 201 | 20 | 29 | Thursday | Summer | 2 |
2017-07-21 | 4 | 3 | 7 | 2017 | 202 | 21 | 29 | Friday | Summer | 1 |
# Count plots for 2018 only: bookings by month plus key categorical variables.
plt.figure(figsize = (20,25))
plt.subplot(3,2,1)
plt.gca().set_title('Bookings By Month')
sns.countplot(x = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(3,2,2)
plt.gca().set_title('Variable market_segment_type')
sns.countplot(x = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
# TODO: relabel market segment codes (Offline, Online, Corporate, Aviation, Complementary).
plt.subplot(3,2,3)
plt.gca().set_title('Variable booking_status')
sns.countplot(x = 'booking_status', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(3,2,4)
plt.gca().set_title('Variable no_of_children')
sns.countplot(x = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: title={'center': 'Variable no_of_children'}, xlabel='no_of_children', ylabel='count'>
# Booking status broken down by arrival month, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Booking Status By Month",fontweight="bold", fontsize=20)
plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'month', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>
# TODO: map booking_status from 0/1 to descriptive labels for readability.
# Grid of booking_status counts split by each categorical feature, 2018 only.
plt.figure(figsize = (20, 25))
plt.suptitle("Analysis Of Variable booking_status",fontweight="bold", fontsize=20)
plt.subplot(5,2,1)
sns.countplot(x = 'booking_status', hue = 'no_of_adults', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,2)
sns.countplot(x = 'booking_status', hue = 'no_of_children', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,3)
sns.countplot(x = 'booking_status', hue = 'no_of_weekend_nights',palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,4)
sns.countplot(x = 'booking_status', hue = 'market_segment_type', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,5)
sns.countplot(x = 'booking_status', hue = 'type_of_meal_plan', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,6)
sns.countplot(x = 'booking_status', hue = 'required_car_parking_space', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,7)
sns.countplot(x = 'booking_status', hue = 'room_type_reserved', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
plt.subplot(5,2,8)
# NOTE(review): hue='year' is degenerate here — the frame is already filtered to 2018.
sns.countplot(x = 'booking_status', hue = 'year', palette = 'YlGnBu', data = data.loc[data['year'] == 2018])
<AxesSubplot: xlabel='booking_status', ylabel='count'>
sns.scatterplot(data=data, x="lead_time", y="avg_price_per_room", palette = 'YlGnBu', hue = 'booking_status')
<AxesSubplot: xlabel='lead_time', ylabel='avg_price_per_room'>
# Daily reservation counts by weekday, split by season.
# dropna() removes days whose season is NaN (the Mar-20 bin-edge case).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
x='weekday',
y='reservations',
hue='season',
ax=ax,
linewidth=1,
palette='YlGnBu')
ax.set_title('Number of Reservations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('reservations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()
# NOTE(review): this re-definition is an exact duplicate of the earlier
# create_features cell; it overrides that definition. Consider deleting one.
def create_features(df, label=None):
    """
    Create calendar/time-series features from a DataFrame with a DatetimeIndex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by date. (BUG FIX: the original body ignored this
        argument and always copied the module-level `data1`.)
    label : str, optional
        Name of a column of ``df`` to return as the target ``y``.

    Returns
    -------
    X : pandas.DataFrame of calendar features, or ``(X, y)`` when
        ``label`` is given.
    """
    df = df.copy()
    df['date'] = df.index
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekday'] = df['date'].dt.day_name()
    # Ordered categorical so weekday plots sort Monday -> Sunday.
    weekday_type = CategoricalDtype(categories=['Monday', 'Tuesday', 'Wednesday',
                                                'Thursday', 'Friday',
                                                'Saturday', 'Sunday'],
                                    ordered=True)
    df['weekday'] = df['weekday'].astype(weekday_type)
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    # Season bucketing: offset (month, day) so the "year" starts around
    # Mar 20; offset exactly 0 falls outside all bins -> NaN season,
    # matching the original behavior.
    df['date_offset'] = (df.date.dt.month*100 + df.date.dt.day - 320)%1300
    df['season'] = pd.cut(df['date_offset'], [0, 300, 600, 900, 1300],
                          labels=['Spring', 'Summer', 'Fall', 'Winter'])
    X = df[['dayofweek','quarter','month','year',
            'dayofyear','dayofmonth','weekofyear', 'weekday', 'season']]
    if label:
        y = df[label]
        return X, y
    return X
# Rebuild features, this time with the per-day SUM of booking_status as the
# target — i.e. presumably daily cancellation counts (plot titles below treat
# 1 = canceled; confirm encoding).
X, y = create_features(data1, label='booking_status')
features_and_target = pd.concat([X, y], axis=1)
features_and_target.head(20)
dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | weekday | season | booking_status | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2017-07-01 | 5 | 3 | 7 | 2017 | 182 | 1 | 26 | Saturday | Summer | 25 |
2017-07-02 | 6 | 3 | 7 | 2017 | 183 | 2 | 26 | Sunday | Summer | 1 |
2017-07-03 | 0 | 3 | 7 | 2017 | 184 | 3 | 27 | Monday | Summer | 1 |
2017-07-04 | 1 | 3 | 7 | 2017 | 185 | 4 | 27 | Tuesday | Summer | 1 |
2017-07-05 | 2 | 3 | 7 | 2017 | 186 | 5 | 27 | Wednesday | Summer | 1 |
2017-07-06 | 3 | 3 | 7 | 2017 | 187 | 6 | 27 | Thursday | Summer | 6 |
2017-07-07 | 4 | 3 | 7 | 2017 | 188 | 7 | 27 | Friday | Summer | 6 |
2017-07-08 | 5 | 3 | 7 | 2017 | 189 | 8 | 27 | Saturday | Summer | 1 |
2017-07-10 | 0 | 3 | 7 | 2017 | 191 | 10 | 28 | Monday | Summer | 2 |
2017-07-11 | 1 | 3 | 7 | 2017 | 192 | 11 | 28 | Tuesday | Summer | 4 |
2017-07-12 | 2 | 3 | 7 | 2017 | 193 | 12 | 28 | Wednesday | Summer | 2 |
2017-07-13 | 3 | 3 | 7 | 2017 | 194 | 13 | 28 | Thursday | Summer | 5 |
2017-07-14 | 4 | 3 | 7 | 2017 | 195 | 14 | 28 | Friday | Summer | 1 |
2017-07-15 | 5 | 3 | 7 | 2017 | 196 | 15 | 28 | Saturday | Summer | 4 |
2017-07-16 | 6 | 3 | 7 | 2017 | 197 | 16 | 28 | Sunday | Summer | 3 |
2017-07-17 | 0 | 3 | 7 | 2017 | 198 | 17 | 29 | Monday | Summer | 15 |
2017-07-18 | 1 | 3 | 7 | 2017 | 199 | 18 | 29 | Tuesday | Summer | 10 |
2017-07-19 | 2 | 3 | 7 | 2017 | 200 | 19 | 29 | Wednesday | Summer | 2 |
2017-07-20 | 3 | 3 | 7 | 2017 | 201 | 20 | 29 | Thursday | Summer | 1 |
2017-07-21 | 4 | 3 | 7 | 2017 | 202 | 21 | 29 | Friday | Summer | 1 |
# Daily cancellation totals by weekday, split by season (both years).
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = features_and_target.dropna(),
x='weekday',
y='booking_status',
hue='season',
ax=ax,
linewidth=1,
palette='YlGnBu')
ax.set_title('Number of Cancelations by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()
# Restrict to 2018 — 2017 only covers July onward (see the grouped output above).
data_2018 = features_and_target.loc[features_and_target['year'] == 2018]
data_2018.head()
dayofweek | quarter | month | year | dayofyear | dayofmonth | weekofyear | weekday | season | booking_status | |
---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||
2018-01-01 | 0 | 1 | 1 | 2018 | 1 | 1 | 1 | Monday | Winter | 0 |
2018-01-02 | 1 | 1 | 1 | 2018 | 2 | 2 | 1 | Tuesday | Winter | 0 |
2018-01-03 | 2 | 1 | 1 | 2018 | 3 | 3 | 1 | Wednesday | Winter | 0 |
2018-01-04 | 3 | 1 | 1 | 2018 | 4 | 4 | 1 | Thursday | Winter | 0 |
2018-01-05 | 4 | 1 | 1 | 2018 | 5 | 5 | 1 | Friday | Winter | 0 |
# Same weekday/season cancellation boxplot, restricted to 2018.
fig, ax = plt.subplots(figsize=(10,5))
sns.boxplot(data = data_2018.dropna(),
x='weekday',
y='booking_status',
hue='season',
ax=ax,
linewidth=1,
palette='YlGnBu')
ax.set_title('Number of Cancelations by Day of Week 2018 Only')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()
# Daily cancellation totals by ISO week of year, split by season (2018 only).
fig, ax = plt.subplots(figsize=(15,5))
sns.boxplot(data = data_2018.dropna(),
            x='weekofyear',
            y='booking_status',
            hue='season',
            ax=ax,
            linewidth=1,
            palette='YlGnBu')
# FIX: title and x-label previously said "Day of Week" (copy-pasted from the
# preceding plot) although the x-axis is week-of-year.
ax.set_title('Number of Cancelations by Week of Year 2018 Only')
ax.set_xlabel('Week of Year')
ax.set_ylabel('Cancelations')
ax.legend(bbox_to_anchor=(1,1))
plt.show()
# Merge the per-day calendar features back onto the row-level bookings.
# Overlapping column names (year, month, booking_status) get _x (row-level)
# and _y (per-day aggregate) suffixes. NOTE(review): this rebinds data1,
# which previously held the groupby-sum frame.
date_data = features_and_target.copy()
data1 = data.merge(date_data, on = 'date')
data1.head()
no_of_adults | no_of_children | no_of_weekend_nights | no_of_week_nights | type_of_meal_plan | required_car_parking_space | room_type_reserved | lead_time | year_x | month_x | ... | dayofweek | quarter | month_y | year_y | dayofyear | dayofmonth | weekofyear | weekday | season | booking_status_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 0 | 1 | 4 | 0 | 0 | 0 | 118 | 2017 | 12 | ... | 3 | 4 | 12 | 2017 | 362 | 28 | 52 | Thursday | Winter | 2 |
1 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 21 | 2017 | 12 | ... | 3 | 4 | 12 | 2017 | 362 | 28 | 52 | Thursday | Winter | 2 |
2 | 2 | 0 | 1 | 1 | 0 | 1 | 2 | 42 | 2017 | 12 | ... | 3 | 4 | 12 | 2017 | 362 | 28 | 52 | Thursday | Winter | 2 |
3 | 2 | 0 | 1 | 3 | 1 | 0 | 0 | 61 | 2017 | 12 | ... | 3 | 4 | 12 | 2017 | 362 | 28 | 52 | Thursday | Winter | 2 |
4 | 2 | 2 | 1 | 0 | 1 | 0 | 3 | 35 | 2017 | 12 | ... | 3 | 4 | 12 | 2017 | 362 | 28 | 52 | Thursday | Winter | 2 |
5 rows × 30 columns
data1.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
no_of_adults | 18116.0 | 1.847262 | 0.51574 | 0.0 | 2.0 | 2.0 | 2.0 | 4.0 |
no_of_children | 18116.0 | 0.107474 | 0.408828 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 |
no_of_weekend_nights | 18116.0 | 0.810775 | 0.873802 | 0.0 | 0.0 | 1.0 | 2.0 | 7.0 |
no_of_week_nights | 18116.0 | 2.208931 | 1.426151 | 0.0 | 1.0 | 2.0 | 3.0 | 17.0 |
type_of_meal_plan | 18116.0 | 0.318613 | 0.629172 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
required_car_parking_space | 18116.0 | 0.031629 | 0.175016 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
room_type_reserved | 18116.0 | 0.336553 | 0.772457 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 |
lead_time | 18116.0 | 85.433429 | 86.63484 | 0.0 | 17.0 | 57.0 | 127.0 | 443.0 |
year_x | 18116.0 | 2017.82049 | 0.383789 | 2017.0 | 2018.0 | 2018.0 | 2018.0 | 2018.0 |
month_x | 18116.0 | 7.439059 | 3.073214 | 1.0 | 5.0 | 8.0 | 10.0 | 12.0 |
day | 18116.0 | 15.645341 | 8.7661 | 1.0 | 8.0 | 16.0 | 23.0 | 31.0 |
market_segment_type | 18116.0 | 0.805917 | 0.645484 | 0.0 | 0.0 | 1.0 | 1.0 | 4.0 |
repeated_guest | 18116.0 | 0.024895 | 0.15581 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
no_of_previous_cancellations | 18116.0 | 0.022411 | 0.370221 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 |
no_of_previous_bookings_not_canceled | 18116.0 | 0.150364 | 1.711651 | 0.0 | 0.0 | 0.0 | 0.0 | 58.0 |
avg_price_per_room | 18116.0 | 103.498054 | 35.461471 | 0.0 | 80.3 | 99.45 | 120.285 | 540.0 |
no_of_special_requests | 18116.0 | 0.617852 | 0.788105 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
booking_status_x | 18116.0 | 0.327832 | 0.469436 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
reservations | 18116.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
dayofweek | 18116.0 | 3.090197 | 2.062916 | 0.0 | 1.0 | 3.0 | 5.0 | 6.0 |
quarter | 18116.0 | 2.801391 | 1.033189 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 |
month_y | 18116.0 | 7.439059 | 3.073214 | 1.0 | 5.0 | 8.0 | 10.0 | 12.0 |
year_y | 18116.0 | 2017.82049 | 0.383789 | 2017.0 | 2018.0 | 2018.0 | 2018.0 | 2018.0 |
dayofyear | 18116.0 | 210.667311 | 93.777748 | 1.0 | 135.0 | 226.0 | 286.0 | 365.0 |
dayofmonth | 18116.0 | 15.645341 | 8.7661 | 1.0 | 8.0 | 16.0 | 23.0 | 31.0 |
weekofyear | 18116.0 | 30.416483 | 13.389291 | 1.0 | 20.0 | 33.0 | 41.0 | 52.0 |
booking_status_y | 18116.0 | 17.444138 | 15.874724 | 0.0 | 6.0 | 14.0 | 23.0 | 85.0 |
data1['booking_status_x']
0 0
1 0
2 0
3 0
4 0
..
18111 1
18112 0
18113 1
18114 0
18115 1
Name: booking_status_x, Length: 18116, dtype: int64
# Inspect correlation between the label and every feature.
# FIX: pass numeric_only=True explicitly — the run below emitted a pandas
# FutureWarning that the implicit default is deprecated; this also excludes
# the datetime/categorical columns from the correlation.
print(data1.corr(numeric_only=True)["booking_status_x"].sort_values(ascending=False))
booking_status_x 1.000000
lead_time 0.434283
booking_status_y 0.355568
year_y 0.183568
year_x 0.183568
avg_price_per_room 0.145339
no_of_week_nights 0.096321
no_of_adults 0.093965
type_of_meal_plan 0.076771
no_of_weekend_nights 0.061341
no_of_children 0.035009
dayofweek 0.028634
room_type_reserved 0.021954
day 0.012104
dayofmonth 0.012104
quarter 0.000175
weekofyear -0.011193
dayofyear -0.011705
month_y -0.012305
month_x -0.012305
no_of_previous_cancellations -0.032113
market_segment_type -0.045607
no_of_previous_bookings_not_canceled -0.060390
required_car_parking_space -0.092620
repeated_guest -0.107060
no_of_special_requests -0.248649
reservations NaN
Name: booking_status_x, dtype: float64
C:\Users\Brett\AppData\Local\Temp\ipykernel_21676\86188331.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
print(data1.corr()["booking_status_x"].sort_values(ascending=False))
# Drop helper columns: 'reservations' is a constant 1 at row level (zero
# variance — its correlation above is NaN), and 'booking_status_y' is a
# per-day aggregate of the target itself.
data1 = data1.drop(['reservations', 'booking_status_y'], axis = 1)
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18116 entries, 0 to 18115
Data columns (total 28 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 no_of_adults 18116 non-null int64
1 no_of_children 18116 non-null int64
2 no_of_weekend_nights 18116 non-null int64
3 no_of_week_nights 18116 non-null int64
4 type_of_meal_plan 18116 non-null int64
5 required_car_parking_space 18116 non-null int64
6 room_type_reserved 18116 non-null int64
7 lead_time 18116 non-null int64
8 year_x 18116 non-null int64
9 month_x 18116 non-null int64
10 day 18116 non-null int64
11 market_segment_type 18116 non-null int64
12 repeated_guest 18116 non-null int64
13 no_of_previous_cancellations 18116 non-null int64
14 no_of_previous_bookings_not_canceled 18116 non-null int64
15 avg_price_per_room 18116 non-null float64
16 no_of_special_requests 18116 non-null int64
17 booking_status_x 18116 non-null int64
18 date 18116 non-null datetime64[ns]
19 dayofweek 18116 non-null int64
20 quarter 18116 non-null int64
21 month_y 18116 non-null int64
22 year_y 18116 non-null int64
23 dayofyear 18116 non-null int64
24 dayofmonth 18116 non-null int64
25 weekofyear 18116 non-null UInt32
26 weekday 18116 non-null category
27 season 18052 non-null category
dtypes: UInt32(1), category(2), datetime64[ns](1), float64(1), int64(23)
memory usage: 3.7 MB
# First modeling feature set: the strongest positive/negative correlates of the label.
X = data1[['avg_price_per_room','no_of_special_requests','market_segment_type','lead_time','required_car_parking_space']]
y = data1['booking_status_x']
print(X.head(4))
print(y.head(4))
avg_price_per_room no_of_special_requests market_segment_type lead_time \
0 110.8 2 1 118
1 107.0 2 1 21
2 91.5 0 1 42
3 92.5 1 1 61
required_car_parking_space
0 0
1 0
2 1
3 0
0 0
1 0
2 0
3 0
Name: booking_status_x, dtype: int64
# Random-forest baseline on a 70/30 split.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): make_classification is imported but never used in this file.
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=101)
# NOTE(review): no random_state on the forest, so accuracy varies run to run.
rfc = RandomForestClassifier(n_estimators=400)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print(accuracy_score(y_test,rfc_pred)*100)
# k stores the accuracy (percent) for later comparison across models.
k=accuracy_score(y_test,rfc_pred)*100
print('\n')
print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))
85.22539098436063
[[3313 339]
[ 464 1319]]
precision recall f1-score support
0 0.88 0.91 0.89 3652
1 0.80 0.74 0.77 1783
accuracy 0.85 5435
macro avg 0.84 0.82 0.83 5435
weighted avg 0.85 0.85 0.85 5435
#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
# Standardize the features: KNN is distance-based, so unscaled columns with
# large ranges (lead_time, avg_price_per_room) would dominate the metric.
scale= StandardScaler()
scale.fit(X)
scaled_features = scale.transform(X)
# BUG FIX: scaled_features was computed but never used — the split below ran
# on the raw X, so the scaling had no effect. Wrap the scaled array back into
# a DataFrame (keeping column names) and split that instead.
df_feat = pd.DataFrame(scaled_features, columns=X.columns, index=X.index)
y = data1['booking_status_x']
# NOTE(review): test_size=0.80 trains on only 20% of the data — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(df_feat,y,test_size=0.80,random_state=101)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
KNN Accuracy score is: 74.27723728696613
[[7752 2017]
[1711 3013]]
precision recall f1-score support
0 0.82 0.79 0.81 9769
1 0.60 0.64 0.62 4724
accuracy 0.74 14493
macro avg 0.71 0.72 0.71 14493
weighted avg 0.75 0.74 0.74 14493
# Elbow plot: misclassification rate for K = 1..19 to pick the best K.
plt.figure(figsize=(14,6))
error = []
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    pred_i = knn.predict(X_test)
    # Mean of the boolean mismatch vector = error rate on the test set.
    error.append(np.mean(pred_i != y_test))
sns.set_style('whitegrid')
plt.plot(range(1,20),error,color='blue', linestyle='dashed', marker='o',
markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Text(0, 0.5, 'Error Rate')
knn = KNeighborsClassifier(n_neighbors=2)
pred = knn.fit(X_train,y_train)
pred = knn.predict(X_test)
#print('LogisticRegression score is: ',np.round(model.score(y_test,pred)*100,decimals=2))
print('\n')
print('Best KNN Accuracy score is: ',accuracy_score(y_test,pred)*100)
print('\n')
m=accuracy_score(y_test,pred)*100
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
Best KNN Accuracy score is: 77.95487476712896
[[9052 717]
[2478 2246]]
precision recall f1-score support
0 0.79 0.93 0.85 9769
1 0.76 0.48 0.58 4724
accuracy 0.78 14493
macro avg 0.77 0.70 0.72 14493
weighted avg 0.78 0.78 0.76 14493
# Logistic-regression baseline on a 60/40 split.
from sklearn.linear_model import LogisticRegression
y = data1['booking_status_x']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40 ,random_state=101)
# NOTE(review): features are unscaled here; the default solver may warn about
# convergence — consider scaling or raising max_iter.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)
print('\n')
print('Accuracy score is: ',accuracy_score(y_test,predictions)*100)
# p stores the accuracy (percent) for later comparison across models.
p=accuracy_score(y_test,predictions)*100
print('\n')
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))
Accuracy score is: 78.52904650200084
[[4359 522]
[1034 1332]]
precision recall f1-score support
0 0.81 0.89 0.85 4881
1 0.72 0.56 0.63 2366
accuracy 0.79 7247
macro avg 0.76 0.73 0.74 7247
weighted avg 0.78 0.79 0.78 7247
# Render the logistic-regression confusion matrix as an annotated image.
sns.set_context("poster", font_scale = .75)
cm = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(6, 5))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Flip the y-axis so "Actual 0s" sits at the top.
ax.set_ylim(1.5, -0.5)
# Write each cell's count at its position.
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()
import plotly.express as px
from plotly import graph_objects
from textwrap import wrap
import chart_studio.plotly as py
# List plotly's named colorscales, wrapped into 96-character columns.
named_colorscales = px.colors.named_colorscales()
print("\n".join(wrap("".join('{:<12}'.format(c) for c in named_colorscales), 96)))
aggrnyl agsunset blackbody bluered blues blugrn bluyl brwnyl
bugn bupu burg burgyl cividis darkmint electric emrld
gnbu greens greys hot inferno jet magenta magma
mint orrd oranges oryel peach pinkyl plasma plotly3
pubu pubugn purd purp purples purpor rainbow rdbu
rdpu redor reds sunset sunsetdark teal tealgrn turbo
viridis ylgn ylgnbu ylorbr ylorrd algae amp deep
dense gray haline ice matter solar speed tempo
thermal turbid armyrose brbg earth fall geyser prgn
piyg picnic portland puor rdgy rdylbu rdylgn spectral
tealrose temps tropic balance curl delta oxy edge
hsv icefire phase twilight mrybm mygbm
!pip install chart-studio
Requirement already satisfied: chart-studio in c:\users\brett\anaconda3\envs\school\lib\site-packages (1.1.0)
Requirement already satisfied: plotly in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (5.13.0)
Requirement already satisfied: requests in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (2.28.1)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.16.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from chart-studio) (1.3.4)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from plotly->chart-studio) (8.2.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from requests->chart-studio) (2.0.4)
# Baseline (untuned) XGBoost classifier on the selected features.
import xgboost as xgb
from sklearn.model_selection import train_test_split

# A small 10-tree ensemble is enough for a quick baseline; fixed seed
# keeps the run reproducible.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
xg_cl.fit(X_train, y_train)
preds = xg_cl.predict(X_test)

# Fraction of test rows predicted correctly. np.mean of the boolean
# comparison replaces the clunkier float(np.sum(...)) / y_test.shape[0].
accuracy = float(np.mean(preds == y_test))
b1 = accuracy * 100  # percentage, kept for the final model-comparison chart
print(f'accuracy: {accuracy:f}')
accuracy: 0.844901
import warnings

# Silence library warnings so the correlation listing stays readable.
warnings.filterwarnings('ignore')

# Rank every feature by its correlation with the label, strongest first.
label_correlations = data1.corr()["booking_status_x"]
print(label_correlations.sort_values(ascending=False))
booking_status_x 1.000000
lead_time 0.434283
year_y 0.183568
year_x 0.183568
avg_price_per_room 0.145339
no_of_week_nights 0.096321
no_of_adults 0.093965
type_of_meal_plan 0.076771
no_of_weekend_nights 0.061341
no_of_children 0.035009
dayofweek 0.028634
room_type_reserved 0.021954
day 0.012104
dayofmonth 0.012104
quarter 0.000175
weekofyear -0.011193
dayofyear -0.011705
month_y -0.012305
month_x -0.012305
no_of_previous_cancellations -0.032113
market_segment_type -0.045607
no_of_previous_bookings_not_canceled -0.060390
required_car_parking_space -0.092620
repeated_guest -0.107060
no_of_special_requests -0.248649
Name: booking_status_x, dtype: float64
# Keep the features that showed the strongest correlation (by magnitude)
# with the label in the listing above.
selected_features = [
    'avg_price_per_room',
    'no_of_special_requests',
    'market_segment_type',
    'lead_time',
    'required_car_parking_space',
    'repeated_guest',
    'no_of_week_nights',
    'no_of_adults',
]
X = data1[selected_features]
y = data1['booking_status_x']

# Sanity-check the first few rows of features and label.
print(X.head(4))
print(y.head(4))
avg_price_per_room no_of_special_requests market_segment_type lead_time \
0 110.8 2 1 118
1 107.0 2 1 21
2 91.5 0 1 42
3 92.5 1 1 61
required_car_parking_space repeated_guest no_of_week_nights no_of_adults
0 0 0 4 2
1 0 0 1 2
2 1 0 1 2
3 0 0 3 2
0 0
1 0
2 0
3 0
Name: booking_status_x, dtype: int64
from sklearn.model_selection import train_test_split

# Hold out 20% for testing. Stratifying on y keeps the cancel / non-cancel
# ratio identical in both splits; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)
!pip install category-encoders
Requirement already satisfied: category-encoders in c:\users\brett\anaconda3\envs\school\lib\site-packages (2.6.0)
Requirement already satisfied: scipy>=1.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.10.0)
Requirement already satisfied: patsy>=0.5.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.5.3)
Requirement already satisfied: pandas>=1.0.5 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.5.3)
Requirement already satisfied: numpy>=1.14.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.23.5)
Requirement already satisfied: statsmodels>=0.9.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (0.13.5)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from category-encoders) (1.2.1)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pandas>=1.0.5->category-encoders) (2022.7)
Requirement already satisfied: six in c:\users\brett\anaconda3\envs\school\lib\site-packages (from patsy>=0.5.1->category-encoders) (1.16.0)
Requirement already satisfied: joblib>=1.1.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->category-encoders) (3.1.0)
Requirement already satisfied: packaging>=21.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from statsmodels>=0.9.0->category-encoders) (22.0)
conda install -c anaconda py-xgboost
Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done
# All requested packages already installed.
Note: you may need to restart the kernel to use updated packages.
==> WARNING: A newer version of conda exists. <==
current version: 23.1.0
latest version: 23.7.3
Please update conda by running
$ conda update -n base -c defaults conda
Or to minimize the number of packages updated during conda update use
conda install conda=23.7.3
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
import xgboost as xgb

# Two-stage pipeline: target-encode any categorical columns, then fit a
# gradient-boosted tree classifier.
estimators = [
    ('encoder', TargetEncoder()),
    ('clf', xgb.XGBClassifier(random_state=123)),
]
pipe = Pipeline(steps=estimators)

# Echo the pipeline so the notebook renders its structure.
pipe
Pipeline(steps=[('encoder', TargetEncoder()), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('encoder', TargetEncoder()), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...))])
TargetEncoder()
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...)
!pip install scikit-optimize
Requirement already satisfied: scikit-optimize in c:\users\brett\anaconda3\envs\school\lib\site-packages (0.9.0)
Requirement already satisfied: joblib>=0.11 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.0)
Requirement already satisfied: pyaml>=16.9 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (21.10.1)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.2.1)
Requirement already satisfied: numpy>=1.13.3 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.23.5)
Requirement already satisfied: scipy>=0.19.1 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-optimize) (1.10.0)
Requirement already satisfied: PyYAML in c:\users\brett\anaconda3\envs\school\lib\site-packages (from pyaml>=16.9->scikit-optimize) (6.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\brett\anaconda3\envs\school\lib\site-packages (from scikit-learn>=0.20.0->scikit-optimize) (3.1.0)
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Hyperparameter ranges for the Bayesian search. Learning rate is sampled
# on a log scale; the sampling / regularisation knobs are uniform.
search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode': Real(0.5, 1.0),
    'clf__reg_alpha': Real(0.0, 10.0),
    'clf__reg_lambda': Real(0.0, 10.0),
    'clf__gamma': Real(0.0, 10.0),
}

# 10 Bayesian iterations, 3-fold CV, scored on ROC AUC.
opt = BayesSearchCV(
    pipe,
    search_space,
    cv=3,
    n_iter=10,
    scoring='roc_auc',
    random_state=8,
)

# Train the XGBoost pipeline while searching the space above.
opt.fit(X_train, y_train)
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
Warning: No categorical columns found. Calling 'transform' will only return input data.
BayesSearchCV(cv=3, estimator=Pipeline(steps=[('encoder', TargetEncoder()), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=N... 'clf__learning_rate': Real(low=0.001, high=1.0, prior='log-uniform', transform='normalize'), 'clf__max_depth': Integer(low=2, high=8, prior='uniform', transform='normalize'), 'clf__reg_alpha': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'), 'clf__reg_lambda': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'), 'clf__subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
BayesSearchCV(cv=3, estimator=Pipeline(steps=[('encoder', TargetEncoder()), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=N... 'clf__learning_rate': Real(low=0.001, high=1.0, prior='log-uniform', transform='normalize'), 'clf__max_depth': Integer(low=2, high=8, prior='uniform', transform='normalize'), 'clf__reg_alpha': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'), 'clf__reg_lambda': Real(low=0.0, high=10.0, prior='uniform', transform='normalize'), 'clf__subsample': Real(low=0.5, high=1.0, prior='uniform', transform='normalize')})
Pipeline(steps=[('encoder', TargetEncoder()), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...))])
TargetEncoder()
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...)
# Inspect the pipeline refit on the full training set with the best
# hyperparameters found by the search.
opt.best_estimator_
Pipeline(steps=[('encoder', TargetEncoder(cols=[])), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=0.9425384185492701, colsample_bynode=0.9095956806239844, colsample_bytree=0.706128679361455, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=1.6598135411398998, gpu_id=None, g...icy=None, importance_type=None, interaction_constraints=None, learning_rate=0.7929828265552742, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=7, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('encoder', TargetEncoder(cols=[])), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=0.9425384185492701, colsample_bynode=0.9095956806239844, colsample_bytree=0.706128679361455, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=1.6598135411398998, gpu_id=None, g...icy=None, importance_type=None, interaction_constraints=None, learning_rate=0.7929828265552742, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=7, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...))])
TargetEncoder(cols=[])
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=0.9425384185492701, colsample_bynode=0.9095956806239844, colsample_bytree=0.706128679361455, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=1.6598135411398998, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.7929828265552742, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=7, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=123, ...)
# Mean cross-validated ROC AUC of the best parameter combination.
opt.best_score_
0.9117809899504422
# Score on the held-out test set using the search's scoring metric (ROC AUC).
opt.score(X_test, y_test)
0.9192097148275309
# Tuned model's CV score as a percentage, for the final comparison chart.
b2 = opt.best_score_ * 100
# Hard 0/1 class predictions on the test set (0 appears to map to
# 'Non-Cancel' per the target_names used later — verify against encoding).
opt.predict(X_test)
array([0, 0, 0, ..., 0, 0, 0])
# Per-class probabilities [P(class 0), P(class 1)] for each test row.
opt.predict_proba(X_test)
array([[0.9834009 , 0.01659912],
[0.86485183, 0.13514817],
[0.8582463 , 0.14175366],
...,
[0.99780715, 0.00219283],
[0.95073146, 0.04926857],
[0.55550265, 0.44449738]], dtype=float32)
# List the fitted pipeline's (name, estimator) steps — used next to pull
# out the trained XGBoost model itself.
opt.best_estimator_.steps
[('encoder', TargetEncoder(cols=[])),
('clf',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=0.9425384185492701,
colsample_bynode=0.9095956806239844,
colsample_bytree=0.706128679361455, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=1.6598135411398998, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.7929828265552742, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=123, ...))]
from xgboost import plot_importance

# Step 1 of the fitted pipeline is ('clf', <fitted XGBClassifier>);
# step 0 is the target encoder. Unpack the tuple to get the model.
xgboost_step = opt.best_estimator_.steps[1]
_step_name, xgboost_model = xgboost_step

# F-score based feature-importance chart for the tuned model.
plot_importance(xgboost_model)
<AxesSubplot: title={'center': 'Feature importance'}, xlabel='F score', ylabel='Features'>
# Cross-validated classification error with early stopping.
n_folds = 5
early_stopping = 10

# Native XGBoost CV takes its own parameter dict and a DMatrix.
params = {
    'objective': 'binary:logistic',
    'seed': 99,
    'eval_metric': 'error',
}
DTrain = xgb.DMatrix(X_train, label=y_train)

# Run 5 boosting rounds over 5 folds, stopping early if the held-out
# error fails to improve for 10 rounds.
cv_df = xgb.cv(
    params,
    DTrain,
    num_boost_round=5,
    nfold=n_folds,
    early_stopping_rounds=early_stopping,
)

# Show per-round train/test error means and standard deviations.
print(cv_df)
train-error-mean train-error-std test-error-mean test-error-std
0 0.163573 0.001738 0.173267 0.006149
1 0.156224 0.002473 0.164780 0.004829
2 0.151998 0.000750 0.158777 0.005875
3 0.149065 0.001235 0.158294 0.005947
4 0.146132 0.001554 0.154637 0.005209
# 10-fold cross-validation over 600 boosting rounds (no early stopping),
# reusing the params above — so the metric is 'error', not AUC.
cv = xgb.cv(params, DTrain, num_boost_round = 600, nfold=10,
            shuffle = True)
# Print the first five rows of the CV results data frame
print(cv.head())
# Calculate the mean of the test error scores (the original comment said
# "AUC", but eval_metric is 'error')
print(np.mean(cv['test-error-mean']).round(2))
# Plot the test error score for each boosting iteration
plt.plot(cv['test-error-mean'])
plt.title('Test Error Score Over 600 Iterations')
plt.xlabel('Iteration Number')
plt.ylabel('Test Error Score')
plt.show()
train-error-mean train-error-std test-error-mean test-error-std
0 0.164689 0.001560 0.170438 0.007783
1 0.156339 0.003456 0.163262 0.006057
2 0.152191 0.001917 0.158846 0.007608
3 0.148910 0.001316 0.156706 0.008268
4 0.146732 0.001538 0.155051 0.006846
0.13
# BUG FIX: classification_report is used below but never imported in the
# visible code (the file header imports only pandas/numpy/seaborn/
# matplotlib). Importing here is harmless if an earlier cell already did.
from sklearn.metrics import classification_report

# Predict with the tuned XGBoost model extracted from the best pipeline.
# (The pipeline's TargetEncoder fitted with cols=[], so predicting on the
# raw features is equivalent here.)
b1_preds = xgboost_model.predict(X_test)
b1_preds

# Per-class precision/recall/F1; target_names maps label 0 -> 'Non-Cancel'.
target_names = ['Non-Cancel', 'Cancel']
print(classification_report(y_test, b1_preds, target_names=target_names))
precision recall f1-score support
Non-Cancel 0.88 0.92 0.90 2436
Cancel 0.83 0.75 0.78 1188
accuracy 0.87 3624
macro avg 0.85 0.83 0.84 3624
weighted avg 0.86 0.87 0.86 3624
import plotly.express as px
from textwrap import wrap

# Donut chart comparing the five models' scores (k, m, p come from the
# earlier Random Forest / KNN / Logistic Regression cells).
# BUG FIX: px.pie's `labels` argument expects a dict that renames data
# columns, not a list of slice names — `names=label` already names the
# slices, so the stray `labels=label` kwarg is dropped.
label = ['Random Forest', 'K Nearest Neighbours', 'Logistics Regression',
         'XGBoost_1', 'XGBoost_2']
fig = px.pie(values=[k, m, p, b1, b2], names=label, width=700, height=700)
fig.update_traces(textposition='inside',
                  textinfo='percent + label',
                  hole=0.50,
                  marker=dict(colors=px.colors.sequential.YlGnBu,
                              line=dict(color='white', width=2)))
# Title text sits in the donut hole; legend is redundant with the
# inside-slice labels.
fig.update_layout(annotations=[dict(text='Performance Comparison',
                                    x=0.5, y=0.5, font_size=20,
                                    showarrow=False,
                                    font_family='monospace',
                                    font_color='black')],
                  showlegend=False)