Sunday, April 27, 2025

XGBoost Analysis Code

 #Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode

from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')


#Import dataset 
data = 'C:/datasets/Wholesale customers data.csv' 
df = pd.read_csv(data)

#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()

#missing value check
df.isnull().sum()

#Checking the distribution of the target variable
df['Channel'].value_counts()

#declaring dependent and independent variables
X = df.drop('Channel', axis=1)
y = df['Channel']

#Var checks
X.head()

y.head()

#Label encoding (OHE)
X_features = X
encoded_df = pd.get_dummies(X_features, drop_first = True)

#Checking the columns created after encoding
list(encoded_df.columns)

#Null imputation


#Performing Logistic Regression
import statsmodels.api as sm

#Note: X_train and y_train come from the train/test split defined further below
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()

#Model Summary
logit_model.summary2()

def get_significant_vars(lm):
    #Store the p-value and corresponding column names in a dataframe
    var_p_vals_df=pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals','vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals<0.05]['vars'])

significant_vars = get_significant_vars(logit_model)
significant_vars



# import XGBoost
#import xgboost as xgb 
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X,label=y)

# split X and y into training and testing sets 
#from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# import XGBClassifier
#from xgboost import XGBClassifier

# declare parameters
params = {
            'objective':'binary:logistic',   # binary classification with logistic loss
            'max_depth': 4,                  # maximum tree depth
            'alpha': 10,                     # L1 regularisation term on weights
            'learning_rate': 1.0,            # step size shrinkage (eta)
            'n_estimators':100               # number of boosting rounds
        }

# instantiate the classifier 
xgb_clf = XGBClassifier(**params)


# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
# alternatively view the parameters of the xgb trained model
print(xgb_clf)
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))



https://gist.github.com/pb111/cc341409081dffa5e9eaf60d79562a03

https://medium.com/@rithpansanga/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d
https://medium.com/@sadafsaleem5815/neural-networks-in-10mins-simply-explained-9ec2ad9ea815

https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

XGBoost Model Documentation: https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters

Remember: It is important to set a subsample value for our exercise since the dataset is imbalanced (https://xgboosting.com/configure-xgboost-subsample-parameter/)
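
A minimal sketch of what that could look like, with scale_pos_weight included as well since that is the parameter XGBoost documents for class imbalance; the specific values are illustrative, not tuned:

# Hedged sketch: subsample + scale_pos_weight for an imbalanced binary target
# (assumes the target is 0/1-encoded; values are illustrative, not tuned)
counts = y_train.value_counts()
imbalance_ratio = counts.max() / counts.min()    # majority / minority class ratio

xgb_clf_imb = XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    learning_rate=0.3,
    n_estimators=100,
    subsample=0.8,                       # each tree sees a random 80% of the rows
    scale_pos_weight=imbalance_ratio     # upweights the minority (positive) class
)
xgb_clf_imb.fit(X_train, y_train)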

Practical example on tuning: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
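
As a rough sketch of the kind of search those tuning guides walk through (the grid values below are illustrative, not recommendations):

# Hedged sketch: grid search over a few XGBoost hyperparameters (illustrative values only)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [3, 4, 6],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

grid = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic'),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)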

Sklearn metrics (AUC; accuracy_score lives in the same module): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
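
For reference, a small sketch of computing AUC next to the accuracy check above; roc_auc_score is usually more convenient than metrics.auc, which expects precomputed fpr/tpr arrays (binary target assumed):

# Hedged sketch: ROC AUC on the test set using predicted probabilities (binary target assumed)
from sklearn.metrics import roc_auc_score

y_proba = xgb_clf.predict_proba(X_test)[:, 1]
print('ROC AUC: {0:0.4f}'.format(roc_auc_score(y_test, y_proba)))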





-----------------
Update on 12.05.25:

#Installs
!pip install xgboost
!pip install shap
!pip install statsmodels


#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings 
warnings.filterwarnings('ignore')


#Import dataset 

df = pd.read_csv('C.csv')

#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()

#missing value check
df.isnull().sum()

#Checking the distribution of the target variable
df['status'].value_counts()


#declaring dependent and independent variables
X = df.drop('status', axis=1) 
y = df['status']


#Label encoding (OHE)
X_features= X

encoded_df = pd.get_dummies(X_features, drop_first = True)
#note: wrapping X_features in df[...] was not required here since it is already a DataFrame

#Checking the columns created after encoding
list(encoded_df.columns)
#Sample check
#encoded_df.head()


#Var checks
#X.describe()
X.info()
encoded_df.info()
y.info()
#X.head()
#y.describe()
#y.head()


#Null Imputation
#https://www.geeksforgeeks.org/ml-handling-missing-values/

#Strategy 1
# Removing rows with missing values
df_cleaned = df.dropna()
print(df_cleaned)

#Strategy 2
#Mean, Median and Mode Imputation

mean_imputation = df['age'].fillna(df['age'].mean())
median_imputation = df['age'].fillna(df['age'].median())
mode_imputation = df['age'].fillna(df['age'].mode().iloc[0])

print("\nImputation using Mean:")
print(mean_imputation)

print("\nImputation using Median:")
print(median_imputation)

print("\nImputation using Mode:")
print(mode_imputation)
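
An alternative sketch of Strategy 2 using scikit-learn's SimpleImputer rather than fillna; it assumes the numeric columns are the ones to impute:

# Hedged sketch: mean/median imputation via sklearn's SimpleImputer (numeric columns assumed)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')                 # or 'mean' / 'most_frequent'
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])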


-----------------
# split X and y into training and testing sets 
#from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size = 0.3, random_state = 0)

#don't forget to use the encoded X DataFrame (encoded_df), which has the label encoding applied

#Performing Logistic Regression
#import statsmodels.api as sm


#df.convert_objects(convert_numeric=True)

#Encoded DF - encoded_df

logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()


#Functions for converting object columns to int or float
#df.convert_objects(convert_numeric=True) - deprecated and removed in newer pandas; use pd.to_numeric instead
#X.astype(float) - converting type
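
Since convert_objects() is gone from current pandas, a small sketch of the replacements ('some_column' below is a placeholder name, not a column from this dataset):

# Hedged sketch: converting object columns to numeric in current pandas
# 'some_column' is a placeholder, not an actual column from the dataset above
df['some_column'] = pd.to_numeric(df['some_column'], errors='coerce')   # invalid strings become NaN
X_train = X_train.astype(float)                                         # cast the whole feature frame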

#Model Summary
logit_model.summary2()
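
One caveat worth noting: sm.Logit does not add an intercept by default, so the fit above is a no-intercept model. A sketch of fitting it with an explicit constant (assuming the target is 0/1 numeric):

# Hedged sketch: the same logit with an explicit intercept term (0/1 target assumed)
X_train_const = sm.add_constant(X_train.astype(float))
logit_model_const = sm.Logit(y_train, X_train_const).fit()
logit_model_const.summary2()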


def get_significant_vars(lm):
    #Store the p-value and corresponding column names in a dataframe
    var_p_vals_df=pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals','vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals<0.05]['vars'])

significant_vars =get_significant_vars(logit_model)
significant_vars


-------------
# import XGBoost
#import xgboost as xgb 
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=encoded_df,label=y)
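
data_dmatrix is defined but not used further down; one thing it is handy for is XGBoost's built-in cross-validation, roughly like this (round counts and metric are illustrative, and y is assumed to be numeric 0/1):

# Hedged sketch: built-in cross-validation on the DMatrix (illustrative settings)
cv_results = xgb.cv(
    params={'objective': 'binary:logistic', 'max_depth': 4, 'alpha': 10, 'learning_rate': 1.0},
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics='auc',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=0
)
print(cv_results.tail())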

# declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':100
        }


# instantiate the classifier 
xgb_clf = XGBClassifier(**params)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)
       
       
# make predictions on test data
y_pred = xgb_clf.predict(X_test)


# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))


-------------------------------------------------------------------------------

A few things to remember:

Sampling - we never actually stop to check what the data distribution looks like; we just assume normality.

Questions I have on this:
Distribution of which variables specifically?
Isn't fraud by definition a rare event, which would make the distribution skewed anyway?

Log normal vs Normal distribution: https://towardsdatascience.com/log-link-vs-log-transformation-in-r-the-difference-that-misleads-your-entire-data-analysis/ - very good article 
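
A quick way to actually check instead of assuming, as a sketch ('amount' below is a placeholder column name, not from the datasets above):

# Hedged sketch: checking skewness and comparing raw vs log-transformed distributions
# 'amount' is a placeholder column name
print(df['amount'].skew())        # strongly positive skew suggests a right-skewed / log-normal shape

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
df['amount'].hist(ax=axes[0], bins=50)
axes[0].set_title('raw')
np.log1p(df['amount']).hist(ax=axes[1], bins=50)
axes[1].set_title('log1p transformed')
plt.show()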


