Bank Fraud Anomaly Detection

Anomaly Detection On UCI Bank Data Using SVM & Neural Networks

August 16, 2023 · 36 mins read
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from google.colab import files

uploaded = files.upload()
df_bank = pd.read_csv('bank-additional-full_normalised.csv')
df_bank.head()
age job=housemaid job=services job=admin. job=blue-collar job=technician job=retired job=management job=unemployed job=self-employed ... previous poutcome=nonexistent poutcome=failure poutcome=success emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed category
0 0.209877 0 0 0 0 0 0 0 0 0 ... 0.000000 1 0 0 1.000000 0.882307 0.376569 0.980730 1.000000 0
1 0.296296 0 0 1 0 0 0 0 0 0 ... 0.000000 1 0 0 1.000000 0.484412 0.615063 0.981183 1.000000 0
2 0.246914 1 0 0 0 0 0 0 0 0 ... 0.000000 1 0 0 0.937500 0.698753 0.602510 0.957379 0.859735 0
3 0.160494 0 1 0 0 0 0 0 0 0 ... 0.142857 0 1 0 0.333333 0.269680 0.192469 0.150759 0.512287 0
4 0.530864 0 0 0 1 0 0 0 0 0 ... 0.000000 1 0 0 0.333333 0.340608 0.154812 0.174790 0.512287 1

5 rows × 63 columns

SVM Model

data = df_bank
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 63 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41188 non-null  float64
 1   job=housemaid                  41188 non-null  int64  
 2   job=services                   41188 non-null  int64  
 3   job=admin.                     41188 non-null  int64  
 4   job=blue-collar                41188 non-null  int64  
 5   job=technician                 41188 non-null  int64  
 6   job=retired                    41188 non-null  int64  
 7   job=management                 41188 non-null  int64  
 8   job=unemployed                 41188 non-null  int64  
 9   job=self-employed              41188 non-null  int64  
 10  job=unknown                    41188 non-null  int64  
 11  job=entrepreneur               41188 non-null  int64  
 12  job=student                    41188 non-null  int64  
 13  marital=married                41188 non-null  int64  
 14  marital=single                 41188 non-null  int64  
 15  marital=divorced               41188 non-null  int64  
 16  marital=unknown                41188 non-null  int64  
 17  education=basic.4y             41188 non-null  int64  
 18  education=high.school          41188 non-null  int64  
 19  education=basic.6y             41188 non-null  int64  
 20  education=basic.9y             41188 non-null  int64  
 21  education=professional.course  41188 non-null  int64  
 22  education=unknown              41188 non-null  int64  
 23  education=university.degree    41188 non-null  int64  
 24  education=illiterate           41188 non-null  int64  
 25  default=0                      41188 non-null  int64  
 26  default=unknown                41188 non-null  int64  
 27  default=1                      41188 non-null  int64  
 28  housing=0                      41188 non-null  int64  
 29  housing=1                      41188 non-null  int64  
 30  housing=unknown                41188 non-null  int64  
 31  loan=0                         41188 non-null  int64  
 32  loan=1                         41188 non-null  int64  
 33  loan=unknown                   41188 non-null  int64  
 34  contact=cellular               41188 non-null  int64  
 35  month=may                      41188 non-null  int64  
 36  month=jun                      41188 non-null  int64  
 37  month=jul                      41188 non-null  int64  
 38  month=aug                      41188 non-null  int64  
 39  month=oct                      41188 non-null  int64  
 40  month=nov                      41188 non-null  int64  
 41  month=dec                      41188 non-null  int64  
 42  month=mar                      41188 non-null  int64  
 43  month=apr                      41188 non-null  int64  
 44  month=sep                      41188 non-null  int64  
 45  day_of_week=mon                41188 non-null  int64  
 46  day_of_week=tue                41188 non-null  int64  
 47  day_of_week=wed                41188 non-null  int64  
 48  day_of_week=thu                41188 non-null  int64  
 49  day_of_week=fri                41188 non-null  int64  
 50  duration                       41188 non-null  float64
 51  campaign                       41188 non-null  float64
 52  pdays                          41188 non-null  float64
 53  previous                       41188 non-null  float64
 54  poutcome=nonexistent           41188 non-null  int64  
 55  poutcome=failure               41188 non-null  int64  
 56  poutcome=success               41188 non-null  int64  
 57  emp.var.rate                   41188 non-null  float64
 58  cons.price.idx                 41188 non-null  float64
 59  cons.conf.idx                  41188 non-null  float64
 60  euribor3m                      41188 non-null  float64
 61  nr.employed                    41188 non-null  float64
 62  category                       41188 non-null  int64  
dtypes: float64(10), int64(53)
memory usage: 19.8 MB
from sklearn.metrics import accuracy_score, classification_report
# nu approximates the expected fraction of outliers in the training data
outlier_fraction = len(data[data['category']==1])/float(len(data[data['category']==0]))
clfsvm = OneClassSVM(kernel="rbf", nu=outlier_fraction)
X = data.loc[:, data.columns != 'category']
Y = data['category']
# OneClassSVM labels inliers +1 and outliers -1; remap to 0/1 to match Y
y_pred = clfsvm.fit_predict(X)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
n_errors = (y_pred != Y).sum()
print("{}: {}".format("No. of Anomalous Points with One-Class SVM ",n_errors))
print("Accuracy Score :")
print(accuracy_score(Y,y_pred))
print("Classification Report :")
print(classification_report(Y,y_pred))
No. of mislabelled points with One-Class SVM: 7252
Accuracy Score :
0.8239292997960571
Classification Report :
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     36548
           1       0.25      0.28      0.27      4640

    accuracy                           0.82     41188
   macro avg       0.58      0.59      0.58     41188
weighted avg       0.83      0.82      0.83     41188
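One-class SVM also exposes a continuous anomaly score through decision_function, which is often more useful than hard labels for triage. A minimal sketch (reusing clfsvm and X from above) that ranks the ten most anomalous rows:

# Signed distance to the separating boundary; more negative = more anomalous
scores = clfsvm.decision_function(X)
most_anomalous = np.argsort(scores)[:10]  # the ten lowest scores
print(data.iloc[most_anomalous][['age', 'duration', 'category']])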
plt.style.use("ggplot")
sns.FacetGrid(data, hue="category").map(plt.scatter, "nr.employed", "age", edgecolor="k").add_legend()
plt.show()

[Figure: scatter of nr.employed vs age, coloured by category]

data.shape
(41188, 63)
from scipy import spatial
# Hold out the last 7 labelled transactions as query points and compare
# them against the first 41,180 rows (row 41180 itself is skipped here)
sample_data = data.head(41180)
samples = data.loc[41181:41188]
frame = []
for i in range(41181, 41188):
    t1 = samples.loc[i]
    # Compare the query row against every earlier row, keeping only
    # suspected-fraud rows (category == 1)
    for j in range(41180):
        t2 = sample_data.loc[j]
        class_label = data.loc[j]["category"]
        similarity = 1 - spatial.distance.cosine(t1, t2)
        if class_label == 1:
            frame.append([class_label, similarity, j])
        
    df = pd.DataFrame(frame, columns=['category', 'Similarity', 'Transaction ID'])
    df_sorted = df.sort_values("Similarity", ascending=False)
    print("Top 5 suspected-fraud transactions having highest similarity with transaction ID = "+str(i)+ ":")
    print(df_sorted.iloc[:5])
    print("\n")
    frame = []
Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41181:
      category  Similarity  Transaction ID
4417       1.0    0.933260           39073
218        1.0    0.933256            1837
3959       1.0    0.865343           35210
626        1.0    0.865231            5502
514        1.0    0.865045            4597


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41182:
      category  Similarity  Transaction ID
1531       1.0    0.901827           13014
3355       1.0    0.898451           29863
3898       1.0    0.836246           34601
2518       1.0    0.835903           22287
4289       1.0    0.835808           37988


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41183:
      category  Similarity  Transaction ID
163        1.0    0.876614            1366
3052       1.0    0.794651           27036
162        1.0    0.794483            1353
349        1.0    0.793905            3079
176        1.0    0.793708            1473


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41184:
      category  Similarity  Transaction ID
4572       1.0    0.859526           40607
3707       1.0    0.859427           32960
3564       1.0    0.858049           31552
4536       1.0    0.858018           40266
2369       1.0    0.854684           20918


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41185:
      category  Similarity  Transaction ID
3333       1.0    0.892698           29636
4084       1.0    0.892566           36207
985        1.0    0.892001            8525
3469       1.0    0.891721           30819
21         1.0    0.891262             153


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41186:
      category  Similarity  Transaction ID
1893       1.0    0.896340           16376
3690       1.0    0.896141           32785
4619       1.0    0.895991           40949
3078       1.0    0.895785           27266
45         1.0    0.895512             332


Top 5 suspected-fraud transactions having highest similarity with transaction ID = 41187:
      category  Similarity  Transaction ID
3800       1.0    0.964525           33717
3119       1.0    0.893301           27606
1254       1.0    0.892938           10897
4181       1.0    0.892748           37092
1063       1.0    0.892707            9180
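The nested loop above does one pandas lookup per pair; the same rankings can be produced in a single vectorised call. A sketch using sklearn's pairwise helper (like the loop, it keeps the label column inside the vectors):

from sklearn.metrics.pairwise import cosine_similarity

# Similarity of each held-out row against every earlier category==1 row
fraud_rows = sample_data[sample_data['category'] == 1]
sims = cosine_similarity(samples.values, fraud_rows.values)  # shape (7, n_fraud)
for k, i in enumerate(samples.index):
    top5 = np.argsort(sims[k])[::-1][:5]
    print("Top 5 matches for transaction {}:".format(i), list(fraud_rows.index[top5]))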
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
pca = PCA(n_components=3)  # 3 components so the data can be plotted in 3-D
to_model_cols = data.columns[0:63]  # note: this slice includes the 'category' label column
outliers = data.loc[data['category']==1]
outlier_index = list(outliers.index)
scaler = StandardScaler()
X = scaler.fit_transform(data[to_model_cols])
X_reduce = pca.fit_transform(X)
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')
ax.set_zlabel("x_composite_3_using_PCA")
# Plotting compressed data points
ax.scatter(X_reduce[:, 0], X_reduce[:, 1], zs=X_reduce[:, 2], s=3, lw=1, label="inliers",c="green")
# Plot x for the ground truth outliers
ax.scatter(X_reduce[outlier_index,0],X_reduce[outlier_index,1], X_reduce[outlier_index,2],
           s=60, lw=2, marker="x", c="red", label="outliers")
ax.legend()
plt.show()

[Figure: 3-D PCA projection with ground-truth outliers marked as red crosses]
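Three components were chosen purely so the projection fits in a 3-D plot; it is worth checking how much variance they actually retain. A quick sketch using the fitted pca object:

# Fraction of the standardised variance captured by each plotted component
print(pca.explained_variance_ratio_)
print("Total retained:", pca.explained_variance_ratio_.sum())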

from sklearn.manifold import TSNE
standardized_data = StandardScaler().fit_transform(data)
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=250)
data = tsne.fit_transform(standardized_data)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 41188 samples in 0.007s...
[t-SNE] Computed neighbors for 41188 samples in 6.399s...
[t-SNE] Computed conditional probabilities for sample 1000 / 41188
...
[t-SNE] Computed conditional probabilities for sample 41188 / 41188
[t-SNE] Mean sigma: 2.229839
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.140144
[t-SNE] KL divergence after 251 iterations: 1.797693e+308
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')
ax.set_zlabel("x_composite_3_using_tSNE")
# Plotting the compressed data points
ax.scatter(data[:, 0], data[:, 1], zs=data[:, 2], s=3, lw=1, label="inliers",c="green")
# Plot x(s) for the ground truth outliers
out_index = [i for i in outlier_index if i <= 41188]
ax.scatter(data[out_index,0],data[out_index,1], data[out_index,2], lw=2, s=60, 
           marker="x", c="red", label="outliers")
ax.legend()
plt.show()

[Figure: 3-D t-SNE projection with ground-truth outliers marked as red crosses]
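The KL divergence above blew up to the float64 maximum, meaning the optimisation diverged: n_iter=250 is the smallest value scikit-learn accepts and is far too few iterations. A sketch of a more trustworthy run (the library default is 1000 iterations):

# Re-run with more iterations so the embedding can actually converge
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=1000)
data = tsne.fit_transform(standardized_data)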

df_bank.columns
Index(['age', 'job=housemaid', 'job=services', 'job=admin.', 'job=blue-collar',
       'job=technician', 'job=retired', 'job=management', 'job=unemployed',
       'job=self-employed', 'job=unknown', 'job=entrepreneur', 'job=student',
       'marital=married', 'marital=single', 'marital=divorced',
       'marital=unknown', 'education=basic.4y', 'education=high.school',
       'education=basic.6y', 'education=basic.9y',
       'education=professional.course', 'education=unknown',
       'education=university.degree', 'education=illiterate', 'default=0',
       'default=unknown', 'default=1', 'housing=0', 'housing=1',
       'housing=unknown', 'loan=0', 'loan=1', 'loan=unknown',
       'contact=cellular', 'month=may', 'month=jun', 'month=jul', 'month=aug',
       'month=oct', 'month=nov', 'month=dec', 'month=mar', 'month=apr',
       'month=sep', 'day_of_week=mon', 'day_of_week=tue', 'day_of_week=wed',
       'day_of_week=thu', 'day_of_week=fri', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome=nonexistent', 'poutcome=failure',
       'poutcome=success', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'category'],
      dtype='object')
df_bank_part1 = df_bank[[ 'default=1', 'housing=0', 'housing=1',
       'housing=unknown', 'loan=0', 'loan=1', 'loan=unknown',
       'contact=cellular', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome=nonexistent', 'poutcome=failure',
       'poutcome=success', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'category']]
plt.figure(figsize=(15,15))
sns.heatmap(df_bank_part1.corr().round(2), annot=True, cmap='YlGnBu')
plt.show()

[Figure: correlation heatmap of the selected features]
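With this many features the annotated heatmap is dense, so it can help to list the strongest correlations programmatically. A sketch that extracts the top pairs from the same correlation matrix:

# Keep the upper triangle only, then rank feature pairs by |correlation|
corr = df_bank_part1.corr().abs()
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pairs = corr.where(mask).stack()
print(pairs.sort_values(ascending=False).head(10))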

import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.models import Model
df_values = df_bank.values
# The last column contains the labels
labels = df_values[:, -1]

# The remaining columns are the features
data = df_values[:, 0:-1]

train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=21)
print(sum(labels==1))
print(sum(labels==0))
4640
36548

Rescale the data into the range 0 to 1.

Convert it to TensorFlow tensors (32-bit floats); TensorFlow has its own data types for storage control.

Note that the scaling uses only the range of the training data, never the test data.

min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)

train_data = (train_data - min_val) / (max_val - min_val)
test_data = (test_data - min_val) / (max_val - min_val)

train_data = tf.cast(train_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)

Split into normal and anomalous data sets

train_labels = train_labels.astype(bool)
test_labels = test_labels.astype(bool)

# Following the TensorFlow autoencoder tutorial's convention, label 1 is
# treated as "normal" and used to train the model; note that in this
# dataset category 1 is the minority class
normal_train_data = train_data[train_labels]
normal_test_data = test_data[test_labels]

anomalous_train_data = train_data[~train_labels]
anomalous_test_data = test_data[~test_labels]
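A quick sanity check of the split sizes is worth doing here, since "normal" under this convention means label 1, the minority class:

# Sketch: confirm how many rows land in each split
print("normal train:", normal_train_data.shape[0])
print("anomalous train:", anomalous_train_data.shape[0])
print("normal test:", normal_test_data.shape[0])
print("anomalous test:", anomalous_test_data.shape[0])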

Plot a normal and an anomalous record (the TensorFlow tutorial this section adapts used ECG patterns here).

# Use the number of feature columns rather than a hard-coded value
n_features = normal_train_data.shape[1]
plt.grid()
plt.plot(np.arange(n_features), normal_train_data[0])
plt.title("Normal Data")
plt.show()

[Figure: feature profile of a normal record]

plt.grid()
plt.plot(np.arange(n_features), anomalous_train_data[0])
plt.title("Anomalous Data")
plt.show()

[Figure: feature profile of an anomalous record]

# The decoder's last layer must match the number of feature columns (62 here)
class AnomalyDetector(Model):
  def __init__(self):
    super(AnomalyDetector, self).__init__()
    self.encoder = tf.keras.Sequential([
      layers.Dense(32, activation="relu"),
      layers.Dense(16, activation="relu"),
      layers.Dense(8, activation="relu")])

    self.decoder = tf.keras.Sequential([
      layers.Dense(16, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(62, activation="sigmoid")])

  def call(self, x):
    encoded = self.encoder(x)
    decoded = self.decoder(encoded)
    return decoded

autoencoder = AnomalyDetector()
autoencoder.compile(optimizer='adam', loss='mae')
history = autoencoder.fit(normal_train_data, normal_train_data, 
          epochs=20, 
          batch_size=512,
          validation_data=(test_data, test_data),
          shuffle=True)
Epoch 1/20
8/8 [==============================] - 7s 32ms/step - loss: 0.4746 - val_loss: 0.4734
Epoch 2/20
8/8 [==============================] - 0s 11ms/step - loss: 0.4692 - val_loss: 0.4651
Epoch 3/20
8/8 [==============================] - 0s 10ms/step - loss: 0.4590 - val_loss: 0.4479
Epoch 4/20
8/8 [==============================] - 0s 10ms/step - loss: 0.4375 - val_loss: 0.4138
Epoch 5/20
8/8 [==============================] - 0s 10ms/step - loss: 0.3949 - val_loss: 0.3528
Epoch 6/20
8/8 [==============================] - 0s 11ms/step - loss: 0.3245 - val_loss: 0.2732
Epoch 7/20
8/8 [==============================] - 0s 11ms/step - loss: 0.2456 - val_loss: 0.2091
Epoch 8/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1917 - val_loss: 0.1733
Epoch 9/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1639 - val_loss: 0.1551
Epoch 10/20
8/8 [==============================] - 0s 11ms/step - loss: 0.1503 - val_loss: 0.1500
Epoch 11/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1466 - val_loss: 0.1483
Epoch 12/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1456 - val_loss: 0.1478
Epoch 13/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1452 - val_loss: 0.1474
Epoch 14/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1447 - val_loss: 0.1454
Epoch 15/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1444 - val_loss: 0.1453
Epoch 16/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1442 - val_loss: 0.1454
Epoch 17/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1438 - val_loss: 0.1443
Epoch 18/20
8/8 [==============================] - 0s 11ms/step - loss: 0.1433 - val_loss: 0.1442
Epoch 19/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1427 - val_loss: 0.1443
Epoch 20/20
8/8 [==============================] - 0s 10ms/step - loss: 0.1422 - val_loss: 0.1426
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
<matplotlib.legend.Legend at 0x7f1594f9f2b0>

[Figure: training and validation loss curves]
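The curves flatten well before epoch 20, so if this model is retrained it is cleaner to let Keras stop automatically. A minimal sketch with an EarlyStopping callback:

# Sketch: stop training once validation loss stops improving
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                              restore_best_weights=True)
history = autoencoder.fit(normal_train_data, normal_train_data,
                          epochs=100, batch_size=512,
                          validation_data=(test_data, test_data),
                          shuffle=True, callbacks=[early_stop])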

encoded_data = autoencoder.encoder(normal_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

plt.plot(normal_test_data[0], 'b')
plt.plot(decoded_data[0], 'r')
plt.fill_between(np.arange(n_features), decoded_data[0], normal_test_data[0], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

[Figure: input vs reconstruction for a normal test record]

encoded_data = autoencoder.encoder(anomalous_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

plt.plot(anomalous_test_data[0], 'b')
plt.plot(decoded_data[0], 'r')
plt.fill_between(np.arange(n_features), decoded_data[0], anomalous_test_data[0], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()

[Figure: input vs reconstruction for an anomalous test record]

reconstructions = autoencoder.predict(normal_train_data)
train_loss = tf.keras.losses.mae(reconstructions, normal_train_data)

plt.hist(train_loss[None,:], bins=50)
plt.xlabel("Train loss")
plt.ylabel("No of examples")
plt.show()
118/118 [==============================] - 0s 1ms/step

[Figure: histogram of training reconstruction loss]

threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)
Threshold:  0.17724909
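One standard deviation above the mean is a heuristic; another common choice is a percentile of the training losses, which ties the threshold to an expected false-positive rate on the training class. A sketch:

# Alternative threshold: flag anything above the 95th percentile of train loss
threshold_p95 = np.percentile(train_loss, 95)
print("95th-percentile threshold:", threshold_p95)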
reconstructions = autoencoder.predict(anomalous_test_data)
test_loss = tf.keras.losses.mae(reconstructions, anomalous_test_data)

plt.hist(test_loss[None, :], bins=50)
plt.xlabel("Test loss")
plt.ylabel("No of examples")
plt.show()

230/230 [==============================] - 0s 1ms/step

[Figure: histogram of test reconstruction loss]

plt.hist(train_loss[None,:], bins=50, color="blue")
plt.hist(test_loss[None,:], bins=50, color='red')
plt.plot([threshold, threshold], [0, 350], linestyle=":")
plt.show()

[Figure: train (blue) vs test (red) loss histograms with the threshold marked]

def predict(model, data, threshold):
  # A record counts as label 1 ("normal" under this convention) when its
  # reconstruction error stays below the threshold
  reconstructions = model(data)
  loss = tf.keras.losses.mae(reconstructions, data)
  return tf.math.less(loss, threshold)

def print_stats(predictions, labels):
  print("Accuracy = {}".format(accuracy_score(labels, predictions)))
  print("Precision = {}".format(precision_score(labels, predictions)))
  print("Recall = {}".format(recall_score(labels, predictions)))
preds = predict(autoencoder, test_data, threshold)
print_stats(preds, test_labels)
Accuracy = 0.2391357125515902
Precision = 0.10843900306077832
Recall = 0.8312849162011173
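Because the reconstruction error is a continuous score, a threshold-free metric complements the fixed-threshold stats above. A sketch computing ROC-AUC (recall that under this convention lower loss corresponds to label 1):

from sklearn.metrics import roc_auc_score

# Negative loss as the score for label 1, since the autoencoder was
# trained to reconstruct label-1 rows
recon = autoencoder.predict(test_data)
test_losses = tf.keras.losses.mae(recon, test_data).numpy()
print("ROC-AUC:", roc_auc_score(test_labels, -test_losses))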
from sklearn.metrics import confusion_matrix
sns.set_context("poster", font_scale = .75)
cm = confusion_matrix(test_labels, preds)
fig, ax = plt.subplots(figsize=(6, 5))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
ax.set_ylim(1.5, -0.5)
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()

[Figure: confusion matrix of autoencoder predictions]