Sunday, April 27, 2025

XGBoost Analysis Code

 #Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')


#Import dataset 
data = 'C:/datasets/Wholesale customers data.csv' 
df = pd.read_csv(data)

#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()

#missing value check
df.isnull().sum()

#Checking the distribution of the target variable
df['Channel'].value_counts()

#declaring dependent and independent variables
X = df.drop('Channel', axis=1)
y = df['Channel']

#Var checks
X.head()

y.head()

#Label encoding (OHE)
X_features = X
encoded_df = pd.get_dummies(X_features, drop_first = True)

#Checking the columns created after encoding
list(encoded_df.columns)

#Null imputation


#Performing Logistic Regression
import statsmodels.api as sm

#Note: X_train and y_train come from the train/test split defined further below
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()

#Model Summary
logit_model.summary2()

def get_significant_vars(lm):
    #Store the p-value and corresponding column names in a dataframe
    var_p_vals_df=pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals','vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals<0.05]['vars'])

significant_vars = get_significant_vars(logit_model)
significant_vars



# import XGBoost
#import xgboost as xgb 
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X,label=y)

# split X and y into training and testing sets 
#from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# import XGBClassifier
#from xgboost import XGBClassifier

# declare parameters
params = {
            'objective':'binary:logistic',   # binary classification with logistic loss
            'max_depth': 4,                  # maximum tree depth
            'alpha': 10,                     # L1 regularisation term on weights
            'learning_rate': 1.0,            # step size shrinkage (eta)
            'n_estimators':100               # number of boosting rounds
        }

# instantiate the classifier 
xgb_clf = XGBClassifier(**params)


# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
# alternatively view the parameters of the xgb trained model
print(xgb_clf)
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))



https://gist.github.com/pb111/cc341409081dffa5e9eaf60d79562a03

https://medium.com/@rithpansanga/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d
https://medium.com/@sadafsaleem5815/neural-networks-in-10mins-simply-explained-9ec2ad9ea815

https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

XGBoost Model Documentation: https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters

Remember: It is important to set a subsample value for our exercise since the dataset is imbalanced (https://xgboosting.com/configure-xgboost-subsample-parameter/)
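
A minimal sketch of what that could look like, with scale_pos_weight included as well since that is the parameter XGBoost documents for class imbalance; the specific values are illustrative, not tuned:

# Hedged sketch: subsample + scale_pos_weight for an imbalanced binary target
# (assumes the target is 0/1-encoded; values are illustrative, not tuned)
counts = y_train.value_counts()
imbalance_ratio = counts.max() / counts.min()    # majority / minority class ratio

xgb_clf_imb = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    learning_rate=0.3,
    n_estimators=100,
    subsample=0.8,                       # each tree sees a random 80% of the rows
    scale_pos_weight=imbalance_ratio     # upweights the minority (positive) class
)
xgb_clf_imb.fit(X_train, y_train)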

Practical example on tuning: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
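
As a rough sketch of the kind of search those tuning guides walk through (the grid values below are illustrative, not recommendations):

# Hedged sketch: grid search over a few XGBoost hyperparameters (illustrative values only)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 6],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

grid = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic'),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)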

Sklearn metrics (AUC; accuracy_score lives in the same module): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
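
For reference, a small sketch of computing AUC next to the accuracy check above; roc_auc_score is usually more convenient than metrics.auc, which expects precomputed fpr/tpr arrays (binary target assumed):

# Hedged sketch: ROC AUC on the test set using predicted probabilities (binary target assumed)
from sklearn.metrics import roc_auc_score

y_proba = xgb_clf.predict_proba(X_test)[:, 1]
print('ROC AUC: {0:0.4f}'.format(roc_auc_score(y_test, y_proba)))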





-----------------
Update on 12.05.25:

#Installs
!pip install xgboost
!pip install shap
!pip install statsmodels


#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings 
warnings.filterwarnings('ignore')


#Import dataset 

df = pd.read_csv('C.csv')

#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()

#missing value check
df.isnull().sum()

#Checking the distribution of the target variable
df['status'].value_counts()


#declaring dependent and independent variables
X = df.drop('status', axis=1) 
y = df['status']


#Label encoding (OHE)
X_features= X

encoded_df = pd.get_dummies(X_features, drop_first = True)
#note: wrapping X_features in df[...] was not required here since it is already a DataFrame

#Checking the columns created after encoding
list(encoded_df.columns)
#Sample check
#encoded_df.head()


#Var checks
#X.describe()
X.info()
encoded_df.info()
y.info()
#X.head()
#y.describe()
#y.head()


#Null Imputation
#https://www.geeksforgeeks.org/ml-handling-missing-values/

#Strategy 1
# Removing rows with missing values
df_cleaned = df.dropna()
print(df_cleaned)

#Strategy 2
#Mean, Median and Mode Imputation

mean_imputation = df['age'].fillna(df['age'].mean())
median_imputation = df['age'].fillna(df['age'].median())
mode_imputation = df['age'].fillna(df['age'].mode().iloc[0])

print("\nImputation using Mean:")
print(mean_imputation)

print("\nImputation using Median:")
print(median_imputation)

print("\nImputation using Mode:")
print(mode_imputation)
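
An alternative sketch of Strategy 2 using scikit-learn's SimpleImputer rather than fillna; it assumes the numeric columns are the ones to impute:

# Hedged sketch: mean/median imputation via sklearn's SimpleImputer (numeric columns assumed)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')                 # or 'mean' / 'most_frequent'
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])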


-----------------
# split X and y into training and testing sets 
#from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size = 0.3, random_state = 0)

#don't forget to use the encoded X DataFrame (encoded_df), which has the label encoding applied

#Performing Logistic Regression
#import statsmodels.api as sm


#df.convert_objects(convert_numeric=True)

#Encoded DF - encoded_df

logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()


#Functions for converting object columns to int or float
#df.convert_objects(convert_numeric=True) - deprecated and removed in newer pandas; use pd.to_numeric instead
#X.astype(float) - converting type
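
Since convert_objects() is gone from current pandas, a small sketch of the replacements ('some_column' below is a placeholder name, not a column from this dataset):

# Hedged sketch: converting object columns to numeric in current pandas
# 'some_column' is a placeholder, not an actual column from the dataset above
df['some_column'] = pd.to_numeric(df['some_column'], errors='coerce')   # invalid strings become NaN
X_train = X_train.astype(float)                                         # cast the whole feature frame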

#Model Summary
logit_model.summary2()
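
One caveat worth noting: sm.Logit does not add an intercept by default, so the fit above is a no-intercept model. A sketch of fitting it with an explicit constant (assuming the target is 0/1 numeric):

# Hedged sketch: the same logit with an explicit intercept term (0/1 target assumed)
X_train_const = sm.add_constant(X_train.astype(float))
logit_model_const = sm.Logit(y_train, X_train_const).fit()
logit_model_const.summary2()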


def get_significant_vars(lm):
    #Store the p-value and corresponding column names in a dataframe
    var_p_vals_df=pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals','vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals<0.05]['vars'])

significant_vars =get_significant_vars(logit_model)
significant_vars


-------------
# import XGBoost
#import xgboost as xgb 
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=encoded_df,label=y)
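
data_dmatrix is defined but not used further down; one thing it is handy for is XGBoost's built-in cross-validation, roughly like this (round counts and metric are illustrative, and y is assumed to be numeric 0/1):

# Hedged sketch: built-in cross-validation on the DMatrix (illustrative settings)
cv_results = xgb.cv(
    params={'objective': 'binary:logistic', 'max_depth': 4, 'alpha': 10, 'learning_rate': 1.0},
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics='auc',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=0
)
print(cv_results.tail())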

# declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':100
        }


# instantiate the classifier 
xgb_clf = XGBClassifier(**params)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
       
       
# make predictions on test data
y_pred = xgb_clf.predict(X_test)


# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))


-------------------------------------------------------------------------------

A few things to remember:

Sampling - we never actually stop to check what the data distribution looks like; we just assume normality.

Questions I have on this:
Distribution of which variables specifically?
Isn't fraud by definition a rare event, which would make the distribution skewed anyway?

Log normal vs Normal distribution: https://towardsdatascience.com/log-link-vs-log-transformation-in-r-the-difference-that-misleads-your-entire-data-analysis/ - very good article 
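
A quick way to actually check instead of assuming, as a sketch ('amount' below is a placeholder column name, not from the datasets above):

# Hedged sketch: checking skewness and comparing raw vs log-transformed distributions
# 'amount' is a placeholder column name
print(df['amount'].skew())        # strongly positive skew suggests a right-skewed / log-normal shape

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
df['amount'].hist(ax=axes[0], bins=50)
axes[0].set_title('raw')
np.log1p(df['amount']).hist(ax=axes[1], bins=50)
axes[1].set_title('log1p transformed')
plt.show()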


