#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')
#Import dataset
data = 'C:/datasets/Wholesale customers data.csv'
df = pd.read_csv(data)
#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()
#missing value check
df.isnull().sum()
#Checking for types of values
df['Channel'].value_counts()
#declaring dependent and independent variables
X = df.drop('Channel', axis=1)
y = df['Channel']
#Var checks
X.head()
y.head()
#One-hot encoding (OHE) of the features
X_features=X
encoded_df = pd.get_dummies(X_features, drop_first=True)
#Checking the columns created after encoding
list(encoded_df.columns)
#Null imputation
#Performing Logistic Regression
import statsmodels.api as sm
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()
#Model Summary
logit_model.summary2()
def get_significant_vars(lm):
    #Store the p-values and corresponding column names in a dataframe
    var_p_vals_df = pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals', 'vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals < 0.05]['vars'])
significant_vars = get_significant_vars(logit_model)
significant_vars
# import XGBoost
#import xgboost as xgb
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X,label=y)
# split X and y into training and testing sets
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
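Note: the split above is unstratified; since StratifiedKFold is already imported and the Channel classes may not be evenly sized, a minimal sketch of a stratified variant (the stratify argument is the only change and is an assumption, not part of the original run):
# Optional sketch: stratified split to preserve the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)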
# import XGBClassifier
#from xgboost import XGBClassifier
# declare parameters
params = {
'objective':'binary:logistic',
'max_depth': 4,
'alpha': 10,
'learning_rate': 1.0,
'n_estimators':100
}
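data_dmatrix is defined above but never used by the sklearn-style classifier; a minimal sketch of how it could feed xgb.cv for a quick cross-validated sanity check (nfold, num_boost_round and the logloss metric are illustrative assumptions):
# Sketch: cross-validation directly on the DMatrix (values are illustrative, not tuned)
cv_results = xgb.cv(params=params, dtrain=data_dmatrix, nfold=5, num_boost_round=50,
                    metrics='logloss', early_stopping_rounds=10, seed=0)
print(cv_results.tail())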
# instantiate the classifier
xgb_clf = XGBClassifier(**params)
# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# alternatively view the parameters of the xgb trained model
print(xgb_clf)
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
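Accuracy alone can be misleading when the classes are unbalanced; a minimal sketch of two complementary checks (cohen_kappa_score is already imported at the top, confusion_matrix is an addition):
# Sketch: confusion matrix and Cohen's kappa as complementary metrics
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
print('Cohen kappa score: {0:0.4f}'.format(cohen_kappa_score(y_test, y_pred)))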
https://gist.github.com/pb111/cc341409081dffa5e9eaf60d79562a03
https://medium.com/@rithpansanga/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d
https://medium.com/@sadafsaleem5815/neural-networks-in-10mins-simply-explained-9ec2ad9ea815
https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
XGBoost Model Documentation: https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
Remember: It is important to set a subsample value for our exercise since the dataset is imbalanced (https://xgboosting.com/configure-xgboost-subsample-parameter/)
Practical example on tuning: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
Sklearn metrics for accuracy: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
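Following the subsample note above, a minimal sketch of how subsample and scale_pos_weight could be added to the params dict; the values are illustrative assumptions and would need tuning:
# Sketch: imbalance-aware parameters (values are assumptions, not tuned)
params_imbalanced = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'learning_rate': 0.3,
    'subsample': 0.8,            # fraction of rows sampled per tree
    'scale_pos_weight': 2,       # roughly the negative/positive class ratio
    'n_estimators': 100
}
xgb_clf_balanced = XGBClassifier(**params_imbalanced)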
-----------------
Update on 12.05.25:
#Installs
!pip install xgboost
!pip install shap
!pip install statsmodels
#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')
#Import dataset
df = pd.read_csv('C.csv')
#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()
#missing value check
df.isnull().sum()
#Checking for types of values
df['status'].value_counts()
#declaring dependent and independent variables
X = df.drop('status', axis=1)
y = df['status']
#One-hot encoding (OHE) of the features
X_features= X
encoded_df = pd.get_dummies(X_features, drop_first = True)
#note: here the df conversion for X_features was not required since it is already a DataFrame
#Checking the columns created after encoding
list(encoded_df.columns)
#Sample check
#encoded_df.head()
#Var checks
#X.describe()
X.info()
encoded_df.info()
y.info()
#X.head()
#y.describe()
#y.head()
#Null Imputation
#https://www.geeksforgeeks.org/ml-handling-missing-values/
#Strategy 1
# Removing rows with missing values
df_cleaned = df.dropna()
print(df_cleaned)
#Strategy 2
#Mean, Median and Mode Imputation
mean_imputation = df['age'].fillna(df['age'].mean())
median_imputation = df['age'].fillna(df['age'].median())
mode_imputation = df['age'].fillna(df['age'].mode().iloc[0])
print("\nImputation using Mean:")
print(mean_imputation)
print("\nImputation using Median:")
print(median_imputation)
print("\nImputation using Mode:")
print(mode_imputation)
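A third option alongside dropna and the manual fillna calls is scikit-learn's SimpleImputer; a minimal sketch (the 'age' column follows the GeeksforGeeks example above and may not exist in this dataset):
#Strategy 3 (sketch)
#sklearn SimpleImputer for numeric columns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df[['age']] = imputer.fit_transform(df[['age']])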
-----------------
# split X and y into training and testing sets
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size = 0.3, random_state = 0)
#don't forget to use the X DataFrame that has the one-hot encoding (encoded_df)
#Performing Logistic Regression
#import statsmodels.api as sm
#df.convert_objects(convert_numeric=True)
#Encoded DF - encoded_df
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()
#Helpers for converting object columns to int or float
#df.convert_objects(convert_numeric=True) was deprecated and removed from pandas; use pd.to_numeric or .astype(float) instead
#sm.Logit(y_train, X_train.astype(float)).fit() - converting type inline
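The comments above point at two common pitfalls: statsmodels' Logit needs numeric (float) inputs, and it does not add an intercept automatically. A minimal sketch with hypothetical variable names, assuming an intercept is wanted:
#Sketch: cast to float and add an intercept before fitting
X_train_const = sm.add_constant(X_train.astype(float))
logit_model_const = sm.Logit(y_train.astype(float), X_train_const).fit()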
#Model Summary
logit_model.summary2()
def get_significant_vars(lm):
    #Store the p-values and corresponding column names in a dataframe
    var_p_vals_df = pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals', 'vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals < 0.05]['vars'])
significant_vars = get_significant_vars(logit_model)
significant_vars
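Once the significant variables are extracted, a minimal sketch of refitting a reduced model on just those columns (this step is an addition, not part of the original notes):
#Sketch: refit the logistic regression on the significant columns only
X_train_sig = X_train[significant_vars]
logit_model_reduced = sm.Logit(y_train, X_train_sig).fit()
logit_model_reduced.summary2()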
-------------
# import XGBoost
#import xgboost as xgb
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=encoded_df,label=y)
# declare parameters
params = {
'objective':'binary:logistic',
'max_depth': 4,
'alpha': 10,
'learning_rate': 1.0,
'n_estimators':100
}
# instantiate the classifier
xgb_clf = XGBClassifier(**params)
# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
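plot_importance and shap are imported at the top but never used; a minimal sketch of how they could be applied to the fitted classifier (plot details are assumptions):
# Sketch: built-in feature importance plot
plot_importance(xgb_clf)
plt.show()
# Sketch: SHAP values for the test set (TreeExplainer supports XGBoost tree models)
explainer = shap.TreeExplainer(xgb_clf)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)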
-------------------------------------------------------------------------------
A few things to remember:
Sampling - we rarely stop to check what the data distribution actually looks like; we just assume normality.
Questions I have on this:
Distribution of which variables specifically?
Isn't fraud by definition a rare event, which would make the distribution skewed anyway?
Log normal vs Normal distribution: https://towardsdatascience.com/log-link-vs-log-transformation-in-r-the-difference-that-misleads-your-entire-data-analysis/ - very good article
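Related to the questions above, a minimal sketch of actually checking the target balance and feature skewness instead of assuming normality (uses the variables defined in the update section):
#Sketch: inspect class balance and feature skewness before assuming normality
print(y.value_counts(normalize=True))      # how imbalanced is the target?
print(df.skew(numeric_only=True))          # skewness of each numeric feature
df.hist(figsize=(12, 8))
plt.show()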