Credit Card Fraud Detection Project - COMP 379/479 Machine Learning
Ololade Akinsanola, Oliver Schramm, Eric Spencer, Tigist Tefera, and Avery Walker

# Data Download
[#1]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost
[#2]
# 1. Install Kaggle CLI
!pip install kaggle
# 2. Write your credentials to ~/.kaggle/kaggle.json
# SECURITY FIX: never hardcode an API key in source control. The
# credentials are read from the environment (KAGGLE_USERNAME / KAGGLE_KEY),
# so the notebook can be shared without leaking a secret.
import os
import json

# Ensure the directory exists
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

creds = {
    "username": os.environ.get("KAGGLE_USERNAME", "ericspencer00"),
    "key": os.environ.get("KAGGLE_KEY", ""),
}

# Write and secure the file (0600 = owner read/write only, required by the
# Kaggle CLI).
creds_path = os.path.join(kaggle_dir, "kaggle.json")
with open(creds_path, "w") as f:
    json.dump(creds, f)
os.chmod(creds_path, 0o600)

# 3. Point Kaggle CLI at that folder
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_dir
# 4. Download & unzip the dataset
!kaggle datasets download -d mlg-ulb/creditcardfraud -p . --unzip
# 5. Load into pandas
# 5. Load the unzipped dataset into a DataFrame.
import pandas as pd

csv_path = "creditcard.csv"
data = pd.read_csv(csv_path)
print("Data downloaded successfully.")
print(data.head())
Output:
Basic Info + Data Visualizations
[#3]
# Structural overview: dtypes, non-null counts, and the first rows.
print(data.info())
print(data.head())

# How imbalanced is the target?
print(data["Class"].value_counts())  # 0 = Non-fraud, 1 = Fraud
Output: Basic info:
- There are some null values we account for later
- SMOTE will be useful for classification
- V1-V28 are anonymized PCA components, we can use feature importance to see which ones matter most
[#4]
# Bar chart of the class balance, plus the raw counts.
print(data["Class"].value_counts())
ax = sns.countplot(data=data, x='Class')
ax.set_title("Fraud vs Non-Fraud Distribution")
plt.show()
Output:
[#5]
# Boxplot of transaction amount split by class.
ax = sns.boxplot(data=data, x='Class', y='Amount')
ax.set_title("Transaction Amount by Fraud Class")
plt.show()
Output: This shows that fraudulent transactions rarely exceed roughly $2000, so amounts above that threshold are almost always legitimate.
[#6]
# Heatmap of class vs. binned transaction amount.
amount_bins = pd.cut(data['Amount'], bins=10)
counts = pd.crosstab(data['Class'], amount_bins)
plt.figure(figsize=(10, 6))
sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
plt.title('Distribution of Fraud vs. Non-Fraud by Transaction Amount')
plt.xlabel('Transaction Amount Bins')
plt.ylabel('Fraud Class (0: Non-Fraud, 1: Fraud)')
plt.show()
Output: Distribution of transaction amount
[#7]
# Histogram (with KDE) of the raw transaction amounts.
plt.figure(figsize=(10, 5))
ax = sns.histplot(data["Amount"], bins=50, kde=True)
ax.set_title("Distribution of Transaction Amounts")
ax.set_xlabel("Transaction Amount ($)")
ax.set_ylabel("Frequency")
plt.show()
Output: A majority of the transactions are very low (close to zero). There are very few high-value transactions. Fraudulent transactions usually involve very high or otherwise unusual purchase amounts.

Features Correlation plot
[#8]
# Pairwise Pearson correlations across every column.
corr = data.corr()
plt.figure(figsize=(12, 6))
sns.heatmap(corr, cmap="coolwarm", annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()
Output: This graph represents features that have a higher correlation marked red, lower correlation marked light blue and negative correlation marked blue
[#9]
# Project the 28 anonymized components down to 2D for visualization.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

features = [f'V{i}' for i in range(1, 29)]
# BUG FIX: X.dropna() and y.dropna() were applied independently, so the
# hue labels could end up a different length than (and misaligned with)
# the rows kept in X. Drop NaN rows from X first, then select the labels
# for exactly those rows by index, keeping X and y row-aligned.
X = data[features].dropna()
y = data.loc[X.index, 'Class']

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, alpha=0.5)
plt.title('PCA Projection of V1–V28')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()
Output:
[#10]
# Colour the same 2-D PCA projection by transaction amount instead of class.
amount_values = data.loc[X.index, 'Amount']

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=amount_values, alpha=0.5)
ax.set_title('PCA Projection of V1–V28')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()
Output: Consider using t-SNE or UMAP…
[#11]
# 2-D t-SNE embedding of the anonymized features, on a 5k-row sample
# (t-SNE scales poorly; the full dataset would be far too slow).
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

sampled_data = data.sample(n=5000, random_state=42)
features = [f'V{i}' for i in range(1, 29)]
X = sampled_data[features]
y = sampled_data['Class']

# NOTE(review): newer scikit-learn renames n_iter -> max_iter; confirm the
# installed version still accepts n_iter.
tsne = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)
X_tsne = tsne.fit_transform(X)

embedding = pd.DataFrame({'TSNE1': X_tsne[:, 0], 'TSNE2': X_tsne[:, 1], 'Class': y.values})
plt.figure(figsize=(10, 6))
sns.scatterplot(data=embedding, x='TSNE1', y='TSNE2', hue='Class', alpha=0.6)
plt.title('t-SNE Projection of V1–V28')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()
Output:
[#12]
# 2-D UMAP embedding of the same sampled features, for comparison with t-SNE.
import umap.umap_ as umap

reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X)

embedding = pd.DataFrame({'UMAP1': X_umap[:, 0], 'UMAP2': X_umap[:, 1], 'Class': y.values})
plt.figure(figsize=(10, 6))
sns.scatterplot(data=embedding, x='UMAP1', y='UMAP2', hue='Class', alpha=0.6)
plt.title('UMAP Projection of V1–V28')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.show()
Output: I don’t believe either of these embeddings will be as helpful as PCA for this task.

# Split data
[#13]
# Train/test split (80/20). NaNs are NOT dropped here (the original comment
# said otherwise); they are handled downstream — dropna before SMOTE and
# imputation in the KNN section.
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): with so few positives, stratify=y would keep the test-set
# class ratio stable across seeds — confirm before changing, since it
# alters every downstream metric.

# Sanity check: how many missing values survive into X / y.
print(X.isna().sum())
print(y.isna().sum())
Output:
Oversampling the imbalanced fraud data using SMOTE
[#14]
# Oversample the minority (fraud) class with SMOTE on the TRAINING set only.
smote = SMOTE(random_state=42)

# BUG FIX: X_train.dropna() and y_train.dropna() were applied independently,
# which can leave X_train and y_train with different rows/lengths and make
# fit_resample fail or silently pair features with the wrong labels. Drop
# rows with any missing feature or label jointly, keeping both aligned.
keep = X_train.notna().all(axis=1) & y_train.notna()
X_train = X_train[keep]
y_train = y_train[keep]

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Print class distribution before vs. after resampling.
print("Class distribution before SMOTE:", Counter(y_train))
print("Class distribution after SMOTE:", Counter(y_train_smote))
Output:
[#15]
# Visualize the class balance after SMOTE resampling.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x=y_train_smote)
ax.set_title("Fraud vs Non-Fraud Distribution ")
plt.show()
Output:
[#16]
# PCA projection of the SMOTE-resampled training features (V1-V28 only).
features = [f'V{i}' for i in range(1, 29)]
X = X_train_smote[features]
y = y_train_smote

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, alpha=0.5)
ax.set_title('PCA Projection of V1–V28')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()
Output:
Normalization
[#17]
# data Standardization
# Fit the scaler on the (SMOTE-resampled) training data only, then apply
# the same learned transform to the test set — avoids test-set leakage.
# NOTE(review): X_test was never NaN-filtered or imputed on this path; any
# NaNs it contains will pass straight through transform() — confirm upstream.
scaler = StandardScaler()
X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)
[#18]
# Re-run the 2-D PCA projection on the standardized training data.
features = [f'V{i}' for i in range(1, 29)]
X = X_train_smote_scaled
y = y_train_smote

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, alpha=0.5)
ax.set_title('PCA Projection of V1–V28')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()
Output:
[#19]
# One PCA scatter per class so each distribution can be inspected alone.
features = [f'V{i}' for i in range(1, 29)]
X = X_train_smote_scaled
y = y_train_smote

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Class 0 uses the default palette colour; class 1 is drawn in orange.
for label, colour in ((0, None), (1, 'orange')):
    mask = (y == label)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=X_pca[mask, 0], y=X_pca[mask, 1], alpha=0.5, color=colour)
    plt.title(f'PCA Projection of V1–V28 (Class {label})')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()
Output:
Training Models
Logistic Regression
Random Forest
XGBoost
SVM (RBF kernel)
KNN

## Logistic Regression
[#20]
# Using a Logistic Regression model we will train the data points for classification.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, ConfusionMatrixDisplay, precision_recall_curve, auc

# train
# FIX: the default max_iter=100 often stops before lbfgs converges on a
# dataset this size (ConvergenceWarning); a larger iteration budget lets
# the solver converge without changing the interface.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_smote_scaled, y_train_smote)

# predict
y_pred = model.predict(X_test_scaled)

# print results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
print("Classification Report:\n", classification_report(y_test, y_pred))
Output:
[#21]
# ROC and PR AUC from predicted fraud probabilities.
# FIX: both metrics rank by a continuous score; feeding hard 0/1
# predictions collapses each curve to a single operating point and
# understates the AUC.
y_score = model.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_score))
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)
Output: The logistic regression model is fairly strong, with an accuracy score of 0.99 and a recall of 0.86 on the fraud class. There are only 22 misclassifications with this model, as opposed to 8315 correct classifications.

## Random Forest
[#22]
# Random-forest classifier on the same resampled, scaled training data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = RandomForestClassifier()
model.fit(X_train_smote_scaled, y_train_smote)
Output:
RandomForestClassifier()
[#23]
# Score the random forest on the held-out test set.
y_pred_rf = model.predict(X_test_scaled)

# Overall accuracy, confusion matrix (text + plot), and per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
cm = confusion_matrix(y_test, y_pred_rf, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.show()
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
Output:
[#24]
# ROC and PR AUC from predicted fraud probabilities.
# FIX: ranking metrics need continuous scores; hard 0/1 predictions
# collapse the curves to a single operating point and understate AUC.
y_score_rf = model.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_score_rf))
precision, recall, thresholds = precision_recall_curve(y_test, y_score_rf)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)
Output: The Random Forest model is even more accurate than the logistic regression model, with an accuracy score of 0.998.

## Support Vector Machine
[#25]
# Use an SVM to predict fraud in addition to SMOTE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# train
# NOTE(review): the training data is already balanced by SMOTE, so
# class_weight='balanced' is effectively a no-op here (weights ~1) —
# confirm the double correction is intended. Also note SVC is fit without
# probability=True, so predict_proba is unavailable on this model.
model = SVC(class_weight='balanced')
model.fit(X_train_smote_scaled, y_train_smote)
Output:
SVC(class_weight='balanced')
[#26]
# Score the SVM on the held-out test set.
y_pred_svm = model.predict(X_test_scaled)

# Overall accuracy, confusion matrix (text + plot), and per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
cm = confusion_matrix(y_test, y_pred_svm, labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.show()
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
Output:
[#27]
# ROC and PR AUC from continuous scores rather than hard labels.
# FIX: this SVC was fit without probability=True, so predict_proba is
# unavailable — use decision_function margins, which both roc_auc_score
# and precision_recall_curve accept as scores.
y_score_svm = model.decision_function(X_test_scaled)
print("ROC AUC:", roc_auc_score(y_test, y_score_svm))
precision, recall, thresholds = precision_recall_curve(y_test, y_score_svm)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)
Output:
KNN
[#28]
# Train using a KNN on mean-imputed features.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Fresh split (NOTE: this rebinds X_train/X_test/y_train/y_test used by
# the earlier cells).
X = data.drop('Class', axis=1)
y = data['Class']

# BUG FIX: the label column was previously "imputed" with the most
# frequent class, i.e. missing ground-truth labels (including TEST
# labels) were fabricated, biasing every metric toward the majority
# class. A row whose label is unknown cannot be used for supervised
# training or scoring — drop such rows instead.
labeled = y.notna()
X = X[labeled]
y = y[labeled]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean-impute missing FEATURE values; fit on the training split only to
# avoid test-set leakage.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Fit the KNN and predict on the imputed test features.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_imputed, y_train)
y_pred_knn = knn_model.predict(X_test_imputed)
[#29]
# Evaluate the KNN on the held-out test set.
print("Accuracy:", accuracy_score(y_test, y_pred_knn))

# Confusion matrix (text + plot) and per-class report.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
cm = confusion_matrix(y_test, y_pred_knn, labels=knn_model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn_model.classes_).plot()
plt.show()
print("Classification Report:\n", classification_report(y_test, y_pred_knn))
Output:
[#30]
# ROC and PR AUC from predicted fraud probabilities.
# FIX: ranking metrics need continuous scores; hard 0/1 predictions
# collapse the curves to a single operating point and understate AUC.
y_score_knn = knn_model.predict_proba(X_test_imputed)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_score_knn))
precision, recall, thresholds = precision_recall_curve(y_test, y_score_knn)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)
Output:
XGBoost
[#31]
# Gradient-boosted trees via XGBoost, on the resampled scaled data.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

# train
# FIX: use_label_encoder was deprecated in xgboost 1.x and removed in
# 2.x (passing it only produces warnings); drop it and keep the explicit
# eval_metric to silence the default-metric warning.
xg_model = XGBClassifier(eval_metric='logloss')
xg_model.fit(X_train_smote_scaled, y_train_smote)
Output:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None, random_state=None, ...)
[#32]
# Score the XGBoost model on the held-out test set.
y_pred_xg = xg_model.predict(X_test_scaled)

# Overall accuracy, confusion matrix (text + plot), and per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred_xg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xg))
cm = confusion_matrix(y_test, y_pred_xg, labels=xg_model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg_model.classes_).plot()
plt.show()
print("Classification Report:\n", classification_report(y_test, y_pred_xg))
Output:
[#33]
# ROC and PR AUC from predicted fraud probabilities.
# FIX: ranking metrics need continuous scores; hard 0/1 predictions
# collapse the curves to a single operating point and understate AUC.
y_score_xg = xg_model.predict_proba(X_test_scaled)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_score_xg))
precision, recall, thresholds = precision_recall_curve(y_test, y_score_xg)
pr_auc = auc(recall, precision)
print("PR AUC:", pr_auc)
Output: