#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')
#Import dataset
data = 'C:/datasets/Wholesale customers data.csv'
df = pd.read_csv(data)
#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()
#missing value check
df.isnull().sum()
#Checking the distribution of target values
df['Channel'].value_counts()
#declaring dependent and independent variables
X = df.drop('Channel', axis=1)
y = df['Channel']
#Var checks
X.head()
y.head()
#One-hot encoding (OHE)
X_features = X
encoded_df = pd.get_dummies(X_features, drop_first=True)
#Checking the columns created after encoding
list(encoded_df.columns)
#Null imputation
#Performing Logistic Regression
import statsmodels.api as sm
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()
#Model Summary
logit_model.summary2()
def get_significant_vars(lm):
    #Store the p-values and corresponding column names in a dataframe
    var_p_vals_df = pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals', 'vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals < 0.05]['vars'])
significant_vars = get_significant_vars(logit_model)
significant_vars
# import XGBoost
#import xgboost as xgb
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X,label=y)
# split X and y into training and testing sets
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# import XGBClassifier
#from xgboost import XGBClassifier
# declare parameters
params = {
'objective':'binary:logistic',
'max_depth': 4,
'alpha': 10,
'learning_rate': 1.0,
'n_estimators':100
}
# instantiate the classifier
xgb_clf = XGBClassifier(**params)
# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# alternatively view the parameters of the xgb trained model
print(xgb_clf)
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
https://gist.github.com/pb111/cc341409081dffa5e9eaf60d79562a03
https://medium.com/@rithpansanga/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d
https://medium.com/@sadafsaleem5815/neural-networks-in-10mins-simply-explained-9ec2ad9ea815
https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
XGBoost Model Documentation: https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters
Remember: it is important to set a subsample value for this exercise since the dataset is imbalanced (https://xgboosting.com/configure-xgboost-subsample-parameter/); a configuration sketch follows the links below.
Practical example on tuning: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
Scikit-learn metrics documentation (AUC): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
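To make the subsample note above concrete, here is a rough configuration sketch (my own addition, not taken from the linked guides). It assumes the target has been recoded to 0/1 as binary:logistic expects; the subsample value and the scale_pos_weight heuristic are illustrative, not tuned settings.
#Sketch: parameters adjusted for an imbalanced binary target (illustrative values)
counts = y_train.value_counts()
params_imbalanced = {
    'objective': 'binary:logistic',
    'max_depth': 4,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,                                 #sample 80% of the rows for each tree
    'scale_pos_weight': counts.max() / counts.min()   #heuristic, assuming the positive class is the rare one
}
xgb_clf_bal = XGBClassifier(**params_imbalanced)
xgb_clf_bal.fit(X_train, y_train)
print('Balanced-config accuracy score: {0:0.4f}'.format(accuracy_score(y_test, xgb_clf_bal.predict(X_test))))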
-----------------
Update on 12.05.25:
#Installs
!pip install xgboost
!pip install shap
!pip install statsmodels
#Imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import shap
import warnings
warnings.filterwarnings('ignore')
#Import dataset
df = pd.read_csv('C.csv')
#Exploring the dataset
df.shape
df.head()
df.info()
df.describe()
#missing value check
df.isnull().sum()
#Checking the distribution of target values
df['status'].value_counts()
#declaring dependent and independent variables
X = df.drop('status', axis=1)
y = df['status']
#One-hot encoding (OHE)
X_features= X
encoded_df = pd.get_dummies(X_features, drop_first = True)
#note: converting X_features to a DataFrame was not required here since it is already a DataFrame
#Checking the columns created after encoding
list(encoded_df.columns)
#Sample check
#encoded_df.head()
#Var checks
#X.describe()
X.info()
encoded_df.info()
y.info()
#X.head()
#y.describe()
#y.head()
#Null Imputation
#https://www.geeksforgeeks.org/ml-handling-missing-values/
#Strategy 1
# Removing rows with missing values
df_cleaned = df.dropna()
print(df_cleaned)
#Strategy 2
#Mean, Median and Mode Imputation (the 'age' column follows the referenced example; swap in a column that exists in this dataset)
mean_imputation = df['age'].fillna(df['age'].mean())
median_imputation = df['age'].fillna(df['age'].median())
mode_imputation = df['age'].fillna(df['age'].mode().iloc[0])
print("\nImputation using Mean:")
print(mean_imputation)
print("\nImputation using Median:")
print(median_imputation)
print("\nImputation using Mode:")
print(mode_imputation)
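A third strategy in the same vein, sketched below (my addition) with scikit-learn's SimpleImputer; the median strategy and the numeric-column selection are illustrative choices.
#Strategy 3 (sketch): scikit-learn's SimpleImputer on the numeric columns
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_imputed = df.copy()
df_imputed[numeric_cols] = imputer.fit_transform(df[numeric_cols])
print(df_imputed.isnull().sum())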
-----------------
# split X and y into training and testing sets
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_df, y, test_size = 0.3, random_state = 0)
#don't forget to use the encoded DataFrame (encoded_df), not the raw X
#Performing Logistic Regression
#import statsmodels.api as sm
#df.convert_objects(convert_numeric=True)
#Encoded DF - encoded_df
logit=sm.Logit(y_train,X_train)
logit_model=logit.fit()
#Functions for converting object columns to int or float
#df.convert_objects(convert_numeric=True)
#X.astype(float) - converting type
#Model Summary
logit_model.summary2()
def get_significant_vars(lm):
    #Store the p-values and corresponding column names in a dataframe
    var_p_vals_df = pd.DataFrame(lm.pvalues)
    var_p_vals_df['vars'] = var_p_vals_df.index
    var_p_vals_df.columns = ['pvals', 'vars']
    #Filter the column names where the p-value is less than 0.05
    return list(var_p_vals_df[var_p_vals_df.pvals < 0.05]['vars'])
significant_vars = get_significant_vars(logit_model)
significant_vars
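A follow-up sketch (my own addition): refit the logit on just the columns flagged as significant above; this assumes significant_vars is non-empty.
#Sketch: refit using only the significant columns
logit_reduced = sm.Logit(y_train, X_train[significant_vars]).fit()
logit_reduced.summary2()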
-------------
# import XGBoost
#import xgboost as xgb
# define data_dmatrix
data_dmatrix = xgb.DMatrix(data=encoded_df,label=y)
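data_dmatrix is built here but not used again in this walkthrough; the usual reason for building one is cross-validation with the native API, sketched below (my addition). The round count, fold count and metric are illustrative choices, and a 0/1-coded target is assumed.
#Sketch: native-API cross-validation on the DMatrix (illustrative settings)
cv_results = xgb.cv(
    params={'objective': 'binary:logistic', 'max_depth': 4, 'alpha': 10, 'learning_rate': 1.0},
    dtrain=data_dmatrix,
    num_boost_round=100,
    nfold=3,
    metrics='error',
    seed=0
)
print(cv_results.tail())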
# declare parameters
params = {
'objective':'binary:logistic',
'max_depth': 4,
'alpha': 10,
'learning_rate': 1.0,
'n_estimators':100
}
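Before fitting with the fixed params above, the tuning guides linked earlier would suggest searching over them; a minimal GridSearchCV sketch follows (my addition). The grid values are illustrative, not recommendations from those guides.
#Sketch: small hyperparameter grid search (illustrative values)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 4, 6],
    'learning_rate': [0.1, 0.3, 1.0],
    'alpha': [0, 1, 10]
}
grid = GridSearchCV(XGBClassifier(objective='binary:logistic', n_estimators=100),
                    param_grid=param_grid, scoring='accuracy', cv=3)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)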
# instantiate the classifier
xgb_clf = XGBClassifier(**params)
# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)
#output:
XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1.0,
max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
n_estimators=100, n_jobs=1, nthread=None,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
subsample=1, verbosity=1)
# make predictions on test data
y_pred = xgb_clf.predict(X_test)
# check accuracy score
from sklearn.metrics import accuracy_score
print('XGBoost model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))
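StratifiedKFold and cohen_kappa_score are imported at the top but never used; here is a rough sketch (my addition) of how they could complement the single accuracy number, reusing encoded_df, y and params from above.
#Sketch: stratified 5-fold evaluation with Cohen's kappa
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
kappas = []
for train_idx, test_idx in skf.split(encoded_df, y):
    fold_clf = XGBClassifier(**params)
    fold_clf.fit(encoded_df.iloc[train_idx], y.iloc[train_idx])
    fold_pred = fold_clf.predict(encoded_df.iloc[test_idx])
    kappas.append(cohen_kappa_score(y.iloc[test_idx], fold_pred))
print('Mean Cohen kappa across folds: {0:0.4f}'.format(np.mean(kappas)))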
-------------------------------------------------------------------------------
Few things to remember:
Sampling - we rarely stop to check what the data distribution actually looks like; we just assume normality.
Questions I have on this:
Distribution of which variables specifically?
Isn't fraud by definition a rare event, which would make the distribution skewed anyway? (A quick check is sketched after the link below.)
Log normal vs Normal distribution: https://towardsdatascience.com/log-link-vs-log-transformation-in-r-the-difference-that-misleads-your-entire-data-analysis/ - very good article
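A quick way to check instead of assuming, sketched below (my addition); it reuses the target y and encoded_df from the walkthrough above.
#Sketch: look at the target balance and feature skewness before assuming normality
print(y.value_counts(normalize=True))        #class balance of the target
print(encoded_df.skew(numeric_only=True))    #skewness of each numeric feature
encoded_df.hist(figsize=(12, 8))             #visual check of the distributions
plt.tight_layout()
plt.show()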