# Uploaded by Nishant Sharma
#
# PFL Segmentation Script
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Importing all required modules
from datetime import datetime, timedelta
import dateutil.relativedelta
import math
import time
import os
import pandas as pd
import json
import numpy as np
from statistics import mean
import pandas as pd
from datetime import date, timedelta
from datetime import datetime, timedelta
import traceback
type_Code_mapping = None
from datetime import datetime, timedelta
import csv
import time
class Segment:
    """
    Segment class to test all the rules for a loan id, which helps in
    classifying it into one of the segments defined by the collections team
    for soft calling.

    The constructor takes a list of record dicts (for example the output of
    ``DataFrame.to_dict(orient='records')``) and uses only the first record.
    That record must provide the keys ``mob``, ``bounce_3M``, ``bounce_6M``,
    ``bounce_12M``, ``avg_dpd_12M``, ``max_dpd_3M``, ``nache_registered`` and
    ``gps_status``.

    ``get_segment_final()`` is the entry point; it returns one of the labels
    ``'S1'`` .. ``'S7'`` or ``None`` when no rule matches.
    """

    def __init__(self, _case):
        # Only the first record of the supplied list is evaluated.
        self.case = _case[0]
        self.calculate_mob()
        self.calculate_bounce()
        self.calculate_payment()
        self.calculate_special()

    def calculate_mob(self):
        """Copy the ``mob`` value of the record onto the instance."""
        self.mob = self.case['mob']

    def calculate_bounce(self):
        """Copy the 3/6/12-month bounce counts onto the instance."""
        self.bounce_3M = self.case['bounce_3M']
        self.bounce_6M = self.case['bounce_6M']
        self.bounce_12M = self.case['bounce_12M']

    def calculate_payment(self):
        """Copy the DPD (days-past-due) inputs onto the instance."""
        self.avg_dpd_12M = self.case['avg_dpd_12M']
        self.max_dpd_3M = self.case['max_dpd_3M']

    def calculate_special(self):
        """Copy the NACH registration flag and GPS status onto the instance."""
        self.nache_registered = self.case['nache_registered']
        self.gps_status = self.case['gps_status']

    def segment_123_mob(self):
        """
        Rules for ``mob`` 1, 2 and 3.

        Returns 'S2' or 'S6'; returns None for any other ``mob`` value.
        """
        if self.mob == 1:
            return 'S2'
        if self.mob in [2, 3]:
            if self.nache_registered == 0:
                return 'S6'
            if self.bounce_3M == 0:
                return 'S2'
            return 'S6'
        return None

    def segment_4_12_mob(self):
        """
        Rules driven by the 6-month bounce count (used for ``mob`` 4 to 12).

        The checks are ordered; the first matching rule wins. Returns None
        when no rule matches.
        """
        if self.bounce_6M >= 4:
            return 'S7'
        if (self.bounce_6M > 1) and (self.bounce_6M < 4):
            return 'S5'
        if (self.bounce_6M == 1) and (self.bounce_3M == 1):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_6M == 1) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_6M == 0:
            return 'S1'
        return None

    def segment_12_plus_mob(self):
        """
        Rules driven by the 12-month bounce count (used for ``mob`` > 12).

        The checks are ordered; the first matching rule wins. Returns None
        when no rule matches.
        """
        if self.bounce_12M >= 6:
            return 'S7'
        if self.bounce_12M in [3, 4, 5]:
            return 'S5'
        if (self.bounce_12M in [1, 2]) and (self.bounce_3M in [1, 2]):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_12M in [1, 2]) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_12M == 0:
            return 'S1'
        return None

    def segment_special(self):
        """
        Override rule checked before the bounce-based rules for ``mob`` >= 4.

        Returns 'S7' when ``avg_dpd_12M`` is at least 15 and at least one of
        these holds: ``max_dpd_3M`` >= 1, GPS status is not 'Active', or NACH
        is not registered. Returns None otherwise.
        """
        if self.avg_dpd_12M >= 15:
            if (
                (self.max_dpd_3M >= 1)
                or (self.gps_status != 'Active')
                or (self.nache_registered == 0)
            ):
                return 'S7'
        return None

    def get_segment(self):
        """
        Pick the rule set that applies to this loan's ``mob`` and evaluate it.

        * ``mob`` <= 3: ``segment_123_mob``.
        * ``mob`` >= 4: ``segment_special`` first; if it yields None, fall
          back to ``segment_4_12_mob`` (``mob`` <= 12) or
          ``segment_12_plus_mob`` (``mob`` > 12).

        Returns None when ``mob`` satisfies neither comparison (e.g. NaN).
        """
        if self.mob <= 3:
            return self.segment_123_mob()
        if self.mob >= 4:
            segment = self.segment_special()
            if segment is None:
                # mob is already known to be >= 4 in this branch.
                if self.mob <= 12:
                    segment = self.segment_4_12_mob()
                elif self.mob > 12:
                    segment = self.segment_12_plus_mob()
            return segment
        return None

    def get_segment_final(self):
        """
        Return the final segment after the post-processing overrides.

        ``get_segment()`` is evaluated first, then the first matching
        override below is applied:

        * ``mob`` in 0..3            -> 'S2' (regardless of the rule result)
        * 'S6' and avg_dpd_12M <= 15 -> 'S2'
        * 'S7' and avg_dpd_12M <= 15 -> 'S6'
        * 'S4'..'S7' and avg_dpd_12M > 15 -> 'S7'
        * anything else is returned unchanged (possibly None).
        """
        segment = self.get_segment()
        if self.mob in [0, 1, 2, 3]:
            return 'S2'
        if (segment == 'S6') and (self.avg_dpd_12M <= 15):
            return 'S2'
        if (segment == 'S7') and (self.avg_dpd_12M <= 15):
            return 'S6'
        if (segment in ['S4', 'S5', 'S6', 'S7']) and (self.avg_dpd_12M > 15):
            return 'S7'
        return segment
def fetch_data(session):
    """
    Run the segmentation query and return its result as a pandas DataFrame.

    The query is executed through ``session.sql(...)`` and materialised with
    ``to_pandas()``. Rows whose ``LOAN_ID`` is null are dropped before the
    frame is returned.

    The final SELECT yields the columns LOAN_ID, DISBURSAL_DATE, FEMI_DATE,
    LOAN_STATUS, NACHE_REGISTERED ('YES'/'NO'), FINAL_STATUS
    ('Active'/'Inactive'), MOB, BOUNCE_3M, BOUNCE_6M, BOUNCE_12M,
    CURRENT_BUCKET, MAX_DPD_BUCKET and AVG_DPD_12M, restricted to loans with
    LOAN_STATUS = 'Active'.

    NOTE(review): AVG_DPD_12M is computed in the query as
    ``AVG_DPD.DPD_DAYS / BOUNCE_3M`` from the PFL_AVG_DPD_3M table -- confirm
    that the "12M" name matches the intended window.

    :param session: object exposing ``sql(query)``; presumably a Snowpark
        ``Session`` -- verify against the caller.
    :return: pandas DataFrame with the columns listed above.
    """
    #### Segmentation data query
    # The SQL text contains no placeholders, so it is a plain string rather
    # than an f-string (a stray brace in the SQL would otherwise be treated
    # as an interpolation).
    segmentation_data = session.sql("""
WITH start_date AS (
SELECT DATE_TRUNC('month', current_date()) AS start_date
),
end_date AS (
SELECT last_day(current_date)+1 AS end_date
),
BASE AS
(SELECT LOAN_ID,
PARTNER_LOAN_ID,
DISBURSAL_DATE,
FIRST_EMI_DUE_DATE,
LOAN_STATUS,
CURRENT_DPD_BKT AS LAST_MONTH_DPD_BKT,
PRINCIPAL_OUTSTANDING AS OPENING_PRINCIPAL,
LOAN_AMOUNT,
(MOB+1) AS MOB
FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
WHERE PARTNER_NAME IN ('PFL') AND LOAN_BOOK_DATE = DATE_TRUNC('month', current_date())-1),
NACH as
(select LOAN_ID,PRIMARY_NACH_STATUS from PC_STITCH_DB.NBFC_PROD.NACH_DATA_FINAL_V1 qualify row_number() over (partition by LOAN_ID order by PRIMARY_NACH_ACTIVATION_DATE desc)=1),
GPS AS
(SELECT * FROM DEMO_DB.PUBLIC.GPS_DATA),
BOUNCE_DATA AS (WITH historical_bounce_data AS
(SELECT LOAN_ID, BOUNCE_FLAG,
DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE) AS EV_MONTH
FROM (SELECT * FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CONSOLIDATED_ALL_PARTNER_BOUNCE" WHERE PARTNER = 'PFL' AND LOAN_BOOK_DATE >= Dateadd(Month, -12, (select end_date from end_date)) AND LOAN_BOOK_DATE < (select end_date from end_date))
)
SELECT TRIM(LOAN_ID) AS LOAN_ID,
//max(case when EV_MONTH >= -3 THEN current_dpd_bkt END) as plus30_ever,
count(distinct case when EV_MONTH >= -3 and bounce_flag = 1 then EV_MONTH end) as bounce_3m,
count(distinct case when EV_MONTH >= -6 and bounce_flag = 1 then EV_MONTH end) as bounce_6m,
count(distinct case when EV_MONTH >= -12 and bounce_flag = 1 then EV_MONTH end) as bounce_12m
FROM historical_bounce_data
GROUP BY LOAN_ID),
DPD_BKT AS (SELECT LOAN_ID, max(current_dpd_bkt) as max_dpd_3m FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
WHERE PARTNER_NAME in ('PFL') AND DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE)>=-3
GROUP BY 1),
CURRENT_DPD AS
(SELECT PROPOSAL_NO,DPD,DATE_TRUNC('month', MIS_DATE) AS MIS_MONTH,INST_DUE_DATE
from "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."PFL_DAILY_MIS"
WHERE MIS_MONTH = (SELECT start_date FROM start_date)
QUALIFY ROW_NUMBER() OVER (PARTITION BY PROPOSAL_NO ORDER BY MIS_DATE DESC,ETL_DATE DESC)=1)
SELECT BASE.LOAN_ID,
BASE.DISBURSAL_DATE,
BASE.FIRST_EMI_DUE_DATE AS FEMI_DATE,
BASE.LOAN_STATUS,
CASE WHEN NACH.PRIMARY_NACH_STATUS IS NULL THEN 'NO' WHEN NACH.PRIMARY_NACH_STATUS IN ('ACTIVE','Active') THEN 'YES' ELSE 'NO' END AS NACHE_REGISTERED,
CASE WHEN GPS.STATUS_FOR_SEGMENTATION IN ('Inactive') then 'Inactive' else 'Active' end AS FINAL_STATUS,
GREATEST(DATEDIFF(month,FEMI_DATE,(select end_date from end_date)),0) AS MOB,
COALESCE(BOUNCE_DATA.bounce_3m,0) AS BOUNCE_3M,
COALESCE(BOUNCE_DATA.bounce_6m,0) AS BOUNCE_6M,
COALESCE(BOUNCE_DATA.bounce_12m,0) AS BOUNCE_12M,
CASE WHEN Z.DPD <= 0 THEN 0
WHEN Z.DPD > 0 AND Z.DPD <= 30 THEN 1
WHEN Z.DPD > 30 AND Z.DPD <= 60 THEN 2
WHEN Z.DPD > 60 AND Z.DPD <= 90 THEN 3
WHEN Z.DPD > 90 AND Z.DPD <= 120 THEN 4
WHEN Z.DPD > 120 AND Z.DPD <= 150 THEN 5
WHEN Z.DPD > 150 AND Z.DPD <= 180 THEN 6
WHEN Z.DPD > 180 AND Z.DPD <= 210 THEN 7
WHEN Z.DPD > 210 AND Z.DPD <= 240 THEN 8
WHEN Z.DPD > 240 AND Z.DPD <= 270 THEN 9
WHEN Z.DPD > 270 AND Z.DPD <= 300 THEN 10
WHEN Z.DPD > 300 AND Z.DPD <= 330 THEN 11
WHEN Z.DPD > 330 AND Z.DPD <= 360 THEN 12
WHEN Z.DPD > 360 THEN 13 END AS CURRENT_BUCKET,
COALESCE(GREATEST(CURRENT_BUCKET,DPD_BKT.max_dpd_3m),0) AS MAX_DPD_BUCKET,
CASE WHEN BOUNCE_3M = 0 THEN 0
WHEN AVG_DPD.DPD_DAYS IS NULL THEN 0 ELSE (AVG_DPD.DPD_DAYS/BOUNCE_3M) END AS AVG_DPD_12M
FROM BASE
LEFT JOIN NACH ON BASE.LOAN_ID = NACH.LOAN_ID
LEFT JOIN GPS ON BASE.LOAN_ID = GPS.LOAN_ID
LEFT JOIN BOUNCE_DATA ON BASE.LOAN_ID = BOUNCE_DATA.LOAN_ID
LEFT JOIN DPD_BKT ON BASE.LOAN_ID = DPD_BKT.LOAN_ID
LEFT JOIN CURRENT_DPD Z ON BASE.PARTNER_LOAN_ID = Z.PROPOSAL_NO
LEFT JOIN DEMO_DB.PUBLIC.PFL_AVG_DPD_3M AVG_DPD ON BASE.LOAN_ID = AVG_DPD.LOAN_ID
WHERE LOAN_STATUS = 'Active'
""")
    # read sql query
    logging.info("Fetching data from segment query...")
    segmentation_data = segmentation_data.to_pandas()
    # remove null values
    logging.info("Running Segment Data query for %s", segmentation_data.columns)
    has_loan_id = segmentation_data['LOAN_ID'].notnull()
    segmentation_data = segmentation_data[has_loan_id]
    logging.info("Segmentation data is %s", segmentation_data)
    return segmentation_data
def df_processor(session, df):
    """
    Select and rename the columns needed for segmentation.

    Keeps ten columns of ``df``, renames them to the names used by the
    segmentation rules, and converts ``nache_registered`` from 'YES'/'NO' to
    1/0. Values other than 'YES'/'NO' become NaN (``Series.map`` behaviour).

    The input frame is not modified; a new DataFrame is returned.

    :param session: unused; kept for a uniform signature with the other steps.
    :param df: DataFrame containing at least the columns LOAN_ID, MOB,
        CURRENT_BUCKET, BOUNCE_3M, BOUNCE_6M, BOUNCE_12M, AVG_DPD_12M,
        MAX_DPD_BUCKET, NACHE_REGISTERED and FINAL_STATUS.
    :return: DataFrame with columns LOAN_ID, mob, current_bucket, bounce_3M,
        bounce_6M, bounce_12M, avg_dpd_12M, max_dpd_3M, nache_registered,
        gps_status.
    :raises KeyError: if any of the required source columns is missing.
    """
    # source column -> name used by the segmentation rules (order preserved)
    column_map = {
        'LOAN_ID': 'LOAN_ID',
        'MOB': 'mob',
        'CURRENT_BUCKET': 'current_bucket',
        'BOUNCE_3M': 'bounce_3M',
        'BOUNCE_6M': 'bounce_6M',
        'BOUNCE_12M': 'bounce_12M',
        'AVG_DPD_12M': 'avg_dpd_12M',
        'MAX_DPD_BUCKET': 'max_dpd_3M',
        'NACHE_REGISTERED': 'nache_registered',
        'FINAL_STATUS': 'gps_status',
    }
    # Explicit copy: the assignments below must not write into a selection of
    # the caller's frame (avoids pandas' SettingWithCopyWarning).
    df = df[list(column_map)].copy()
    df.columns = list(column_map.values())
    logging.info("Columnsss are :%s", df.columns)

    ## Corrections
    # gps_status needs no remapping here: FINAL_STATUS is already reduced to
    # 'Active'/'Inactive' by the CASE expression in the segmentation query.
    nach_mapper = {'NO': 0, 'YES': 1}
    df['nache_registered'] = df['nache_registered'].map(nach_mapper)

    logging.info("Processing df %s", df)
    return df
def segment_dict_extractor(session, df):
    """
    Main function to iterate over the rows in the dataframe and run the
    Segment class, which classifies each case into one of the segments.

    Adds a ``"Segment Names"`` column to ``df`` (in place) holding the value
    of ``Segment.get_segment_final()`` for each row, and returns ``df``.
    The column is created even when ``df`` has no rows.

    Each row is classified from its own values; the frame is converted to
    records once instead of being re-filtered by LOAN_ID for every row.

    :param session: unused; kept for a uniform signature with the other steps.
    :param df: DataFrame with a ``LOAN_ID`` column plus the columns read by
        ``Segment``.
    :return: the same DataFrame object with the ``"Segment Names"`` column.
    """
    segment_dict = {}
    segment_names = []
    for record in df.to_dict(orient='records'):
        # Segment expects a list of records and evaluates the first one.
        segment = Segment([record]).get_segment_final()
        segment_dict[record["LOAN_ID"]] = segment
        segment_names.append(segment)
    # Positional assignment: one value per row, in row order.
    df["Segment Names"] = segment_names
    logging.info("Segment dict is %s and df is %s", segment_dict, df.columns)
    return df
def main(session: snowpark.Session):
    """
    Run the segmentation pipeline end to end.

    Fetches the raw data with ``fetch_data``, prepares it with
    ``df_processor``, attaches segments with ``segment_dict_extractor`` and
    returns the result of ``session.create_dataframe`` on the final frame.
    """
    raw_frame = fetch_data(session)
    prepared_frame = df_processor(session, raw_frame)
    segmented_frame = segment_dict_extractor(session, prepared_frame)
    return session.create_dataframe(segmented_frame)
Download