# Uploaded by Nishant Sharma
#
# Cars24 Segmentation Script
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Importing all required modules
from datetime import datetime, timedelta
import dateutil.relativedelta
import math
import time
import os
import pandas as pd
import json
import numpy as np
from statistics import mean
import pandas as pd
from datetime import date, timedelta
from datetime import datetime, timedelta
import traceback
type_Code_mapping = None
from datetime import datetime, timedelta
import csv
import time
class Segment:
    """
    Apply the collections soft-calling segmentation rules to a single loan.

    The rules look at the loan's months-on-book (``mob``), its bounce counts
    over the last 3/6/12 months, its average and maximum DPD figures, and two
    special flags (NACH registration and GPS status).  The result is one of
    the segment codes ``'S1'`` .. ``'S7'``, or ``None`` when no rule matches.
    """

    def __init__(self, _case):
        """
        Args:
            _case: a sequence whose first element is a mapping with the keys
                ``mob``, ``bounce_3M``, ``bounce_6M``, ``bounce_12M``,
                ``avg_dpd_12M``, ``max_dpd_3M``, ``nache_registered`` and
                ``gps_status``.  Only the first element is used.
        """
        self.case = _case[0]
        self.calculate_mob()
        self.calculate_bounce()
        self.calculate_payment()
        self.calculate_special()

    def calculate_mob(self):
        """Copy the months-on-book value from the case onto the instance."""
        self.mob = self.case['mob']
        return None

    def calculate_bounce(self):
        """Copy the 3/6/12 month bounce counts from the case onto the instance."""
        self.bounce_3M = self.case['bounce_3M']
        self.bounce_6M = self.case['bounce_6M']
        self.bounce_12M = self.case['bounce_12M']
        return None

    def calculate_payment(self):
        """Copy the DPD figures from the case onto the instance."""
        self.avg_dpd_12M = self.case['avg_dpd_12M']
        self.max_dpd_3M = self.case['max_dpd_3M']
        return None

    def calculate_special(self):
        """Copy the NACH-registration flag and GPS status onto the instance."""
        self.nache_registered = self.case['nache_registered']
        self.gps_status = self.case['gps_status']
        return None

    def segment_123_mob(self):
        """Return the segment for loans with ``mob`` of 1, 2 or 3 (else ``None``)."""
        if self.mob == 1:
            return 'S2'
        if self.mob in (2, 3):
            if self.nache_registered == 0:
                return 'S6'
            if self.bounce_3M == 0:
                return 'S2'
            return 'S6'
        return None

    def segment_4_12_mob(self):
        """Return the segment for loans with ``mob`` between 4 and 12.

        Rules are evaluated in order; the first match wins.
        """
        if self.bounce_6M >= 4:
            return 'S7'
        if (self.bounce_6M > 1) and (self.bounce_6M < 4):
            return 'S5'
        if (self.bounce_6M == 1) and (self.bounce_3M == 1):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_6M == 1) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_6M == 0:
            return 'S1'
        return None

    def segment_12_plus_mob(self):
        """Return the segment for loans with ``mob`` above 12.

        Rules are evaluated in order; the first match wins.
        """
        if self.bounce_12M >= 6:
            return 'S7'
        if self.bounce_12M in (3, 4, 5):
            return 'S5'
        if (self.bounce_12M in (1, 2)) and (self.bounce_3M in (1, 2)):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_12M in (1, 2)) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_12M == 0:
            return 'S1'
        return None

    def segment_special(self):
        """Return ``'S7'`` for high-average-DPD loans with a risk flag, else ``None``.

        A loan qualifies when its 12-month average DPD is at least 15 and any
        of: ``max_dpd_3M >= 1``, GPS status is not ``'Active'``, or NACH is
        not registered.
        """
        if self.avg_dpd_12M >= 15:
            if (
                (self.max_dpd_3M >= 1)
                or (self.gps_status != 'Active')
                or (self.nache_registered == 0)
            ):
                return 'S7'
        return None

    def get_segment(self):
        """Return the rule-based segment before the final adjustments.

        Loans with ``mob <= 3`` use the early-vintage rules.  Loans with
        ``mob >= 4`` first try the special rule and fall back to the 4-12 or
        12+ rules when it does not match.  Any other ``mob`` (e.g. a value
        for which both comparisons are false, such as NaN) yields ``None``.
        """
        if self.mob <= 3:
            return self.segment_123_mob()
        if self.mob >= 4:
            _segment = self.segment_special()
            if _segment is None:
                if (self.mob >= 4) and (self.mob <= 12):
                    _segment = self.segment_4_12_mob()
                elif self.mob > 12:
                    _segment = self.segment_12_plus_mob()
                else:
                    _segment = None
            return _segment
        return None

    def get_segment_final(self):
        """Return the final segment after the average-DPD adjustments.

        Adjustments, first match wins:
          * ``mob`` in 0..3            -> always ``'S2'`` (overrides the
            early-vintage rule result);
          * ``'S6'`` with avg DPD <= 15 -> ``'S2'``;
          * ``'S7'`` with avg DPD <= 15 -> ``'S6'``;
          * ``'S4'``..``'S7'`` with avg DPD > 15 -> ``'S7'``;
          * otherwise the rule-based segment is returned unchanged.
        """
        _segment = self.get_segment()
        if self.mob in (0, 1, 2, 3):
            return 'S2'
        if (_segment == 'S6') and (self.avg_dpd_12M <= 15):
            return 'S2'
        if (_segment == 'S7') and (self.avg_dpd_12M <= 15):
            return 'S6'
        if (_segment in ('S4', 'S5', 'S6', 'S7')) and (self.avg_dpd_12M > 15):
            return 'S7'
        return _segment
def fetch_data(session):
    """
    Run the segmentation query and return its result as a pandas DataFrame.

    The query is executed through ``session.sql(...)`` and materialised with
    ``to_pandas()``.  Rows whose ``LOAN_ID`` is null are dropped before the
    frame is returned.

    Args:
        session: object exposing ``sql(query)``; the returned object must
            expose ``to_pandas()`` (in this script, a Snowpark session).

    Returns:
        pandas.DataFrame with one row per query result row that has a
        non-null ``LOAN_ID``.
    """
    #### Segmentation data query
    # NOTE: plain string on purpose -- the query has no placeholders, so the
    # former f-prefix did nothing. The SQL text itself is unchanged.
    segmentation_query = """
WITH start_date AS (
SELECT DATE_TRUNC('month', current_date()) AS start_date
),
end_date AS (
SELECT last_day(current_date)+1 AS end_date
),
base_data AS (WITH A AS (SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as LOAN_ID
,MAX(CASE WHEN NAME_='loanDpdValue' THEN TEXT_ end) as DPD
,MAX(CASE WHEN NAME_='oldActualDisbursementDate' THEN TEXT_ end) as Old_Disbursal_Date
,MAX(CASE WHEN NAME_='oldLoan' THEN TEXT_ end) as Old_Loan
,MAX(CASE WHEN NAME_='oldPrincipal' THEN TEXT_ end) as oldPrincipal
,MAX(CASE WHEN NAME_='closureType' THEN TEXT_ end) as Closure_Type
,MAX(CASE WHEN NAME_='foreclosureType' THEN TEXT_ end) as Foreclosure_Type
,MAX(CASE WHEN NAME_='oldNumberOfRepayments' THEN TEXT_ end) as oldNumberOfRepayments
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
WHERE _FIVETRAN_DELETED = 'FALSE'
GROUP BY 1), B AS
(SELECT * FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN" WHERE _FIVETRAN_DELETED = 'FALSE')
SELECT A.LOAN_ID,
Case when B.loan_status_id = 300 then 'Active'
when B.loan_status_id = 100 then 'Submitted and pending approval'
when B.loan_status_id = 200 then 'Approved'
when B.loan_status_id = 400 then 'Withdrawn by applicant'
when B.loan_status_id = 600 then 'Closed (obligations met)'
when B.loan_status_id = 700 then 'Overpaid'
when B.loan_status_id = 305 then 'Active Repossession'
end as LOAN_STATUS,
CASE WHEN A.Old_Loan IS NULL then B.principal_disbursed_derived else A.oldPrincipal end as LOAN_AMOUNT,
A.DPD AS CURRENT_DPD,
B.principal_outstanding_derived as PRINCIPAL_OUTSTANDING,
CASE WHEN A.Old_Loan IS NULL THEN B.disbursedon_date ELSE TO_DATE(A.Old_Disbursal_Date,'dd mmmm yyyy') END AS DATE_OF_LOAN_DISBURSEMENT
FROM A
LEFT JOIN B ON A.LMS_LOAN_ID = B.ACCOUNT_NO),
CURRENT_MONTH_BOUNCE AS (SELECT LOAN_ID, MONTH, EMI_DUE_DATE, BOUNCE_FLAG
FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CARS24_BOUNCE_DATA"
WHERE LOANBOOK_MONTH >= (select end_date from end_date) AND LOANBOOK_MONTH < Dateadd(Month, 1, (select end_date from end_date))),
h_bounce_data AS (WITH historical_bounce_data AS (
SELECT LOAN_ID, BOUNCE_FLAG, MONTH,
DATEDIFF(month, (select end_date from end_date), LOANBOOK_MONTH) AS EV_MONTH
FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CARS24_BOUNCE_DATA"
WHERE LOANBOOK_MONTH >= Dateadd(Month, -12, (select end_date from end_date))
AND LOANBOOK_MONTH < (select end_date from end_date)
ORDER BY EV_MONTH DESC)
SELECT TRIM(loan_id) AS LOAN_ID,
//max(case when EV_MONTH >= -3 THEN current_dpd_bkt END) as plus30_ever,
count(distinct case when EV_MONTH >= -3 and bounce_flag = 1 then EV_MONTH end) as bounce_3m,
count(distinct case when EV_MONTH >= -6 and bounce_flag = 1 then EV_MONTH end) as bounce_6m,
count(distinct case when EV_MONTH >= -12 and bounce_flag = 1 then EV_MONTH end) as bounce_12m
FROM historical_bounce_data
GROUP BY LOAN_ID),
NACH as
(select LOAN_ID,PRIMARY_NACH_STATUS from PC_STITCH_DB.NBFC_PROD.NACH_DATA_FINAL_V1 qualify row_number() over (partition by LOAN_ID order by PRIMARY_NACH_ACTIVATION_DATE desc)=1),
GPS AS
(SELECT * FROM DEMO_DB.PUBLIC.GPS_DATA_SEGMENTATION),
dpd_bkt AS (SELECT LOAN_ID, max(current_dpd_bkt) as plus30_ever FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
WHERE (LOWER(LOAN_STATUS) not like '%close%' AND LOWER(LOAN_STATUS) not like '%write%')
AND PARTNER_NAME in ('C24') AND DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE)>=-3
GROUP BY 1),
avg_dpd AS (WITH A AS (SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as LOAN_ID
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
WHERE _FIVETRAN_DELETED = 'FALSE'
GROUP BY 1),
B AS (SELECT * FROM A WHERE LMS_LOAN_ID is not NULL),
txn AS (SELECT txn_new.loan_id,txn_new.transaction_date,mapping_new.loan_repayment_schedule_id
FROM
(SELECT id ,loan_id,transaction_date FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_TRANSACTION" WHERE transaction_type_enum = 2 and is_reversed = 0) AS txn_new
LEFT JOIN
(SELECT * FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_TRANSACTION_REPAYMENT_SCHEDULE_MAPPING" WHERE _FIVETRAN_DELETED = 'FALSE') AS mapping_new
ON txn_new.id = mapping_new.loan_transaction_id
WHERE (principal_portion_derived is not null and principal_portion_derived != 0)),
base AS (select B.LOAN_ID,B.LMS_LOAN_ID as LMS_LOAN_ID,
RPS.DUEDATE as DUE_DATE,
(RPS.PRINCIPAL_AMOUNT+RPS.INTEREST_AMOUNT) as INSTALLMENTAMOUNT,
RPS.PRINCIPAL_AMOUNT as PRINCIPAL,
RPS.INTEREST_AMOUNT as INTREST,
txn.transaction_date AS PAYMENT_DATE
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_REPAYMENT_SCHEDULE" RPS
LEFT JOIN B ON RPS.LOAN_ID = B.LMS_LOAN_ID
LEFT JOIN txn ON txn.loan_repayment_schedule_id=RPS.id
WHERE _FIVETRAN_DELETED = 'FALSE' ORDER BY LOAN_ID,INSTALLMENT)
SELECT loan_id,
sum(datediff(day, due_date, payment_Date)) as total_dpd
FROM base WHERE DUE_DATE > DATEADD(MONTH, -12, (select end_date from end_date)) AND DUE_DATE <= (select end_date from end_date)
AND LOWER(LOAN_ID) not like '%d%'
GROUP BY loan_id),
OPENING_DPD AS
(select LOAN_NUMBER_CAR24,
CASE WHEN DPD_VALUE <= 0 THEN 0
WHEN DPD_VALUE > 0 AND DPD_VALUE <= 30 THEN 1
WHEN DPD_VALUE > 30 AND DPD_VALUE <= 60 THEN 2
WHEN DPD_VALUE > 60 AND DPD_VALUE <= 90 THEN 3
WHEN DPD_VALUE > 90 AND DPD_VALUE <= 120 THEN 4
WHEN DPD_VALUE > 120 AND DPD_VALUE <= 150 THEN 5
WHEN DPD_VALUE > 150 AND DPD_VALUE <= 180 THEN 6
WHEN DPD_VALUE > 180 AND DPD_VALUE <= 210 THEN 7
WHEN DPD_VALUE > 210 AND DPD_VALUE <= 240 THEN 8
WHEN DPD_VALUE > 240 AND DPD_VALUE <= 270 THEN 9
WHEN DPD_VALUE > 270 AND DPD_VALUE <= 300 THEN 10
WHEN DPD_VALUE > 300 AND DPD_VALUE <= 330 THEN 11
WHEN DPD_VALUE > 330 AND DPD_VALUE <= 360 THEN 12
WHEN DPD_VALUE > 360 THEN 13 END AS OPENING_BUCKET from "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."CONSUMER_STATUS_RECORDRBI"
WHERE CREATED_DATE = (SELECT start_date FROM start_date) and _FIVETRAN_DELETED = 'FALSE'),
FEMI_NEW AS (WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as Loan_ID
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN parse_json(TEXT_) end) as json
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN BYTEARRAY_ID_ end) as BYTEARRAY_ID_
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
GROUP BY 1)
SELECT A.LOAN_ID,(PRINCIPAL_AMOUNT+INTEREST_AMOUNT) AS FIRST_EMI_AMOUNT,DUEDATE AS FIRST_EMI_DATE
from "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_REPAYMENT_SCHEDULE" LRS
LEFT JOIN A ON A.LMS_LOAN_ID = LRS.LOAN_ID where _FIVETRAN_DELETED = 'FALSE'
AND INSTALLMENT = 1
),
FEMI_OLD AS
(WITH RPS AS (WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as Loan_ID
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN parse_json(TEXT_) end) as json
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN BYTEARRAY_ID_ end) as BYTEARRAY_ID_
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
GROUP BY 1)
SELECT TASK_ID_,
LMS_LOAN_ID,
Loan_ID,
CASE WHEN bytes_ IS NULL THEN json ELSE parse_json(try_HEX_DECODE_STRING(to_varchar(SUBSTR(bytes_,8)))) END AS json_final from A
LEFT JOIN "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_GE_BYTEARRAY" B ON A.BYTEARRAY_ID_ = B.ID_),
RPS_FINAL as
(SELECT d.Loan_ID,
d.LMS_LOAN_ID,
f.value:dueDate::string as dueDate,
f.value:openingPrincipal::string as openingPrincipal,
f.value:installmentAmount::string as installmentAmount,
f.value:principal::string as principal,
f.value:intrest::string as intrest,
f.value:moratoriumFlag::string as moratoriumFlag,
f.value:moratoriumInterestDue::string as moratoriumInterestDue,
f.value:closingPrincipal::string as closingPrincipal,
CASE WHEN duedate IS NULL THEN NULL
WHEN TRY_TO_DATE(duedate, 'YYYY-MM-DD') IS NOT NULL THEN TRY_TO_DATE(duedate, 'YYYY-MM-DD')
WHEN LEN(duedate)<4 THEN NULL
ELSE TO_DATE(TO_CHAR(TO_DATE(duedate, 'DD MON YYYY'), 'YYYY-MM-DD')) END AS Final_Due_Date,
MONTH(Final_Due_Date) AS RPS_MONTH,
YEAR(Final_Due_Date) AS RPS_YEAR
FROM RPS d, table(flatten(input=>d.json_final)) f)
SELECT LOAN_ID,Final_Due_Date AS FIRST_EMI_DATE FROM RPS_FINAL WHERE LOAN_ID NOT LIKE ('%D%')
QUALIFY ROW_NUMBER() OVER (PARTITION BY Loan_ID ORDER BY Final_Due_Date) = 1
)
SELECT Z.*,
CASE WHEN Z.CURRENT_DPD <= 0 THEN 0
WHEN Z.CURRENT_DPD > 0 AND Z.CURRENT_DPD <= 30 THEN 1
WHEN Z.CURRENT_DPD > 30 AND Z.CURRENT_DPD <= 60 THEN 2
WHEN Z.CURRENT_DPD > 60 AND Z.CURRENT_DPD <= 90 THEN 3
WHEN Z.CURRENT_DPD > 90 AND Z.CURRENT_DPD <= 120 THEN 4
WHEN Z.CURRENT_DPD > 120 AND Z.CURRENT_DPD <= 150 THEN 5
WHEN Z.CURRENT_DPD > 150 AND Z.CURRENT_DPD <= 180 THEN 6
WHEN Z.CURRENT_DPD > 180 AND Z.CURRENT_DPD <= 210 THEN 7
WHEN Z.CURRENT_DPD > 210 AND Z.CURRENT_DPD <= 240 THEN 8
WHEN Z.CURRENT_DPD > 240 AND Z.CURRENT_DPD <= 270 THEN 9
WHEN Z.CURRENT_DPD > 270 AND Z.CURRENT_DPD <= 300 THEN 10
WHEN Z.CURRENT_DPD > 300 AND Z.CURRENT_DPD <= 330 THEN 11
WHEN Z.CURRENT_DPD > 330 AND Z.CURRENT_DPD <= 360 THEN 12
WHEN Z.CURRENT_DPD > 360 THEN 13 END AS CURRENT_BUCKET, COALESCE(c.bounce_3m,0) AS bounce_3m, COALESCE(c.bounce_6m,0) AS bounce_6m, COALESCE(c.bounce_12m,0) AS bounce_12m, CASE WHEN COALESCE(c.bounce_12m,0)=0 THEN 0 ELSE round(d.total_dpd/c.bounce_12m) END AS AVG_DPD_12M,
GREATEST(O.OPENING_BUCKET, e.PLUS30_EVER) AS max_dpd_bucket,
--b.BOUNCE_FLAG,
CASE WHEN NACH.PRIMARY_NACH_STATUS IS NULL THEN 'NO' WHEN NACH.PRIMARY_NACH_STATUS IN ('ACTIVE','Active') then 'YES' else 'NO' end AS nache_registered,
CASE WHEN GPS.STATUS_FOR_SEGMENTATION IN ('Inactive') then 'Inactive' else 'Active' end AS FINAL_STATUS,
CASE WHEN FEMI_OLD.FIRST_EMI_DATE IS NULL THEN FEMI_NEW.FIRST_EMI_DATE ELSE FEMI_OLD.FIRST_EMI_DATE END AS FIRST_EMI,
DATEDIFF(month,FIRST_EMI,(select end_date from end_date)) AS MOB
FROM BASE_DATA Z
--LEFT JOIN CURRENT_MONTH_BOUNCE b ON Z.LOAN_ID = b.LOAN_ID
LEFT JOIN h_bounce_data c ON Z.LOAN_ID = c.LOAN_ID
LEFT JOIN avg_dpd d ON Z.LOAN_ID = d.LOAN_ID
LEFT JOIN dpd_bkt e ON Z.LOAN_ID = e.LOAN_ID
LEFT JOIN NACH ON Z.LOAN_ID = NACH.LOAN_ID
LEFT JOIN GPS ON Z.LOAN_ID = GPS.LOAN_ID
LEFT JOIN OPENING_DPD O ON Z.LOAN_ID = O.LOAN_NUMBER_CAR24
LEFT JOIN FEMI_NEW ON Z.LOAN_ID = FEMI_NEW.LOAN_ID
LEFT JOIN FEMI_OLD ON Z.LOAN_ID = FEMI_OLD.LOAN_ID
WHERE Z.LOAN_STATUS = 'Active'
"""
    segmentation_data = session.sql(segmentation_query)

    # Materialise the query result as a pandas DataFrame.
    logging.info("Fetching data from segment query...")
    segmentation_data = segmentation_data.to_pandas()
    # segmentation_data.columns = segmentation_data.columns.str.lower()

    # Lazy %-style arguments produce the same log text as the former
    # f-strings but are only rendered when the record is actually emitted.
    # (This message is logged after the query has already returned.)
    logging.info("Running Segment Data query for %s", segmentation_data.columns)

    # Drop rows that have no LOAN_ID.
    segmentation_data = segmentation_data[segmentation_data['LOAN_ID'].notnull()]
    logging.info("Segmentation data is %s", segmentation_data)
    return segmentation_data
def df_processor(session, df):
    """
    Select and rename the columns the segmentation rules need.

    Keeps ten columns of ``df``, renames them to the attribute names read by
    ``Segment`` and converts the NACH flag from ``'YES'``/``'NO'`` to
    ``1``/``0`` (any other value becomes NaN, as ``Series.map`` does for
    unmapped keys).

    Args:
        session: unused; kept for a uniform pipeline signature.
        df: DataFrame containing at least the upper-case columns listed in
            ``column_map`` below.

    Returns:
        A new DataFrame (the input frame is not modified).

    Raises:
        KeyError: if any required column is missing from ``df``.
    """
    # Source column -> name expected by the Segment rules.
    # Note that MAX_DPD_BUCKET feeds the ``max_dpd_3M`` rule input and
    # FINAL_STATUS feeds ``gps_status``.
    column_map = {
        'LOAN_ID': 'LOAN_ID',
        'MOB': 'mob',
        'CURRENT_BUCKET': 'current_bucket',
        'BOUNCE_3M': 'bounce_3M',
        'BOUNCE_6M': 'bounce_6M',
        'BOUNCE_12M': 'bounce_12M',
        'AVG_DPD_12M': 'avg_dpd_12M',
        'MAX_DPD_BUCKET': 'max_dpd_3M',
        'NACHE_REGISTERED': 'nache_registered',
        'FINAL_STATUS': 'gps_status',
    }
    # rename() returns an independent frame, so the column assignment below
    # never writes into (or warns about) a view of the caller's DataFrame.
    df = df[list(column_map)].rename(columns=column_map)
    logging.info("Columnsss are :%s", df.columns)

    ## Corrections
    # gps_status is passed through unchanged: the query in fetch_data already
    # reduces it to 'Active' / 'Inactive', so no Python-side mapping is applied.
    nach_mapper = {'NO': 0, 'YES': 1}
    df['nache_registered'] = df['nache_registered'].map(nach_mapper)

    logging.info("Processing df %s", df)
    return df
def segment_dict_extractor(session, df):
    """
    Classify every loan in ``df`` and store the result in "Segment Names".

    Each row is turned into a record and passed to ``Segment``; the value of
    ``get_segment_final()`` is written to a new "Segment Names" column.

    When a LOAN_ID appears more than once, every one of its rows receives the
    segment computed from the first record with that LOAN_ID.

    Args:
        session: unused; kept for a uniform pipeline signature.
        df: DataFrame with a ``LOAN_ID`` column plus the columns ``Segment``
            reads.  It is modified in place.

    Returns:
        The same DataFrame, with the "Segment Names" column added.
    """
    segment_dict = {}
    # Single pass over the records instead of re-filtering the whole frame
    # for every row (which was quadratic in the number of loans).
    for record in df.to_dict(orient='records'):
        loan_id = record["LOAN_ID"]
        if loan_id not in segment_dict:
            segment_dict[loan_id] = Segment([record]).get_segment_final()

    # Assign the column once, after the loop, rather than growing the frame
    # cell by cell while it is being iterated.
    df["Segment Names"] = df["LOAN_ID"].map(segment_dict)

    logging.info("Segment dict is %s and df is %s", segment_dict, df.columns)
    return df
def main(session: snowpark.Session):
    """
    Entry point: fetch, prepare and segment the loans, then return the
    result as a DataFrame created through ``session.create_dataframe``.
    """
    raw_loans = fetch_data(session)
    rule_inputs = df_processor(session, raw_loans)
    segmented_loans = segment_dict_extractor(session, rule_inputs)
    return session.create_dataframe(segmented_loans)
# Download