"""Soft-calling segmentation of active loans, run as a Snowpark handler.

Pipeline (see ``main``): run the segmentation SQL, keep and rename the
columns the rules need, classify every loan with ``Segment`` and return the
result as a Snowpark DataFrame.
"""
# Importing all required modules
import csv
import json
import logging
import math
import os
import sys
import time
import traceback
from datetime import date, datetime, timedelta
from statistics import mean

import dateutil.relativedelta
import numpy as np
import pandas as pd
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import col

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

type_Code_mapping = None


# NOTE: the query contains `//` and `--` line comments, so its line breaks are
# significant: each comment must end at a newline or it swallows the SQL that
# follows it.
_SEGMENTATION_QUERY = """
WITH start_date AS (
    SELECT DATE_TRUNC('month', current_date()) AS start_date
),
end_date AS (
    SELECT last_day(current_date)+1 AS end_date
),
base_data AS
(WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as LOAN_ID
,MAX(CASE WHEN NAME_='loanDpdValue' THEN TEXT_ end) as DPD
,MAX(CASE WHEN NAME_='oldActualDisbursementDate' THEN TEXT_ end) as Old_Disbursal_Date
,MAX(CASE WHEN NAME_='oldLoan' THEN TEXT_ end) as Old_Loan
,MAX(CASE WHEN NAME_='oldPrincipal' THEN TEXT_ end) as oldPrincipal
,MAX(CASE WHEN NAME_='closureType' THEN TEXT_ end) as Closure_Type
,MAX(CASE WHEN NAME_='foreclosureType' THEN TEXT_ end) as Foreclosure_Type
,MAX(CASE WHEN NAME_='oldNumberOfRepayments' THEN TEXT_ end) as oldNumberOfRepayments
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
WHERE _FIVETRAN_DELETED = 'FALSE'
GROUP BY 1),
B AS
(SELECT *
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN"
WHERE _FIVETRAN_DELETED = 'FALSE')
SELECT A.LOAN_ID,
Case when B.loan_status_id = 300 then 'Active'
when B.loan_status_id = 100 then 'Submitted and pending approval'
when B.loan_status_id = 200 then 'Approved'
when B.loan_status_id = 400 then 'Withdrawn by applicant'
when B.loan_status_id = 600 then 'Closed (obligations met)'
when B.loan_status_id = 700 then 'Overpaid'
when B.loan_status_id = 305 then 'Active Repossession'
end as LOAN_STATUS,
CASE WHEN A.Old_Loan IS NULL then B.principal_disbursed_derived else A.oldPrincipal end as LOAN_AMOUNT,
A.DPD AS CURRENT_DPD,
B.principal_outstanding_derived as PRINCIPAL_OUTSTANDING,
CASE WHEN A.Old_Loan IS NULL THEN B.disbursedon_date ELSE TO_DATE(A.Old_Disbursal_Date,'dd mmmm yyyy') END AS DATE_OF_LOAN_DISBURSEMENT
FROM A
LEFT JOIN B
ON A.LMS_LOAN_ID = B.ACCOUNT_NO),
CURRENT_MONTH_BOUNCE AS
(SELECT LOAN_ID, MONTH, EMI_DUE_DATE, BOUNCE_FLAG
FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CARS24_BOUNCE_DATA"
WHERE LOANBOOK_MONTH >= (select end_date from end_date)
AND LOANBOOK_MONTH < Dateadd(Month, 1, (select end_date from end_date))),
h_bounce_data AS
(WITH historical_bounce_data AS (
SELECT LOAN_ID, BOUNCE_FLAG, MONTH,
DATEDIFF(month, (select end_date from end_date), LOANBOOK_MONTH) AS EV_MONTH
FROM "PC_STITCH_DB"."NBFC_DWH_PRE_PROD"."CARS24_BOUNCE_DATA"
WHERE LOANBOOK_MONTH >= Dateadd(Month, -12, (select end_date from end_date))
AND LOANBOOK_MONTH < (select end_date from end_date)
ORDER BY EV_MONTH DESC)
SELECT TRIM(loan_id) AS LOAN_ID,
//max(case when EV_MONTH >= -3 THEN current_dpd_bkt END) as plus30_ever,
count(distinct case when EV_MONTH >= -3 and bounce_flag = 1 then EV_MONTH end) as bounce_3m,
count(distinct case when EV_MONTH >= -6 and bounce_flag = 1 then EV_MONTH end) as bounce_6m,
count(distinct case when EV_MONTH >= -12 and bounce_flag = 1 then EV_MONTH end) as bounce_12m
FROM historical_bounce_data
GROUP BY LOAN_ID),
NACH as
(select LOAN_ID,PRIMARY_NACH_STATUS
from PC_STITCH_DB.NBFC_PROD.NACH_DATA_FINAL_V1
qualify row_number() over (partition by LOAN_ID order by PRIMARY_NACH_ACTIVATION_DATE desc)=1),
GPS AS
(SELECT * FROM DEMO_DB.PUBLIC.GPS_DATA_SEGMENTATION),
dpd_bkt AS
(SELECT LOAN_ID, max(current_dpd_bkt) as plus30_ever
FROM "PC_STITCH_DB"."NBFC_PROD"."LOAN_PERFORMANCE_DETAILS"
WHERE (LOWER(LOAN_STATUS) not like '%close%' AND LOWER(LOAN_STATUS) not like '%write%')
AND PARTNER_NAME in ('C24')
AND DATEDIFF(month, (select end_date from end_date), LOAN_BOOK_DATE)>=-3
GROUP BY 1),
avg_dpd AS
(WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as LOAN_ID
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
WHERE _FIVETRAN_DELETED = 'FALSE'
GROUP BY 1),
B AS
(SELECT * FROM A WHERE LMS_LOAN_ID is not NULL),
txn AS
(SELECT txn_new.loan_id,txn_new.transaction_date,mapping_new.loan_repayment_schedule_id
FROM
(SELECT id ,loan_id,transaction_date
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_TRANSACTION"
WHERE transaction_type_enum = 2 and is_reversed = 0) AS txn_new
LEFT JOIN
(SELECT *
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_TRANSACTION_REPAYMENT_SCHEDULE_MAPPING"
WHERE _FIVETRAN_DELETED = 'FALSE') AS mapping_new
ON txn_new.id = mapping_new.loan_transaction_id
WHERE (principal_portion_derived is not null and principal_portion_derived != 0)),
base AS
(select B.LOAN_ID,B.LMS_LOAN_ID as LMS_LOAN_ID,
RPS.DUEDATE as DUE_DATE,
(RPS.PRINCIPAL_AMOUNT+RPS.INTEREST_AMOUNT) as INSTALLMENTAMOUNT,
RPS.PRINCIPAL_AMOUNT as PRINCIPAL,
RPS.INTEREST_AMOUNT as INTREST,
txn.transaction_date AS PAYMENT_DATE
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_REPAYMENT_SCHEDULE" RPS
LEFT JOIN B
ON RPS.LOAN_ID = B.LMS_LOAN_ID
LEFT JOIN txn
ON txn.loan_repayment_schedule_id=RPS.id
WHERE _FIVETRAN_DELETED = 'FALSE'
ORDER BY LOAN_ID,INSTALLMENT)
SELECT loan_id, sum(datediff(day, due_date, payment_Date)) as total_dpd
FROM base
WHERE DUE_DATE > DATEADD(MONTH, -12, (select end_date from end_date))
AND DUE_DATE <= (select end_date from end_date)
AND LOWER(LOAN_ID) not like '%d%'
GROUP BY loan_id),
OPENING_DPD AS
(select LOAN_NUMBER_CAR24,
CASE WHEN DPD_VALUE <= 0 THEN 0
WHEN DPD_VALUE > 0 AND DPD_VALUE <= 30 THEN 1
WHEN DPD_VALUE > 30 AND DPD_VALUE <= 60 THEN 2
WHEN DPD_VALUE > 60 AND DPD_VALUE <= 90 THEN 3
WHEN DPD_VALUE > 90 AND DPD_VALUE <= 120 THEN 4
WHEN DPD_VALUE > 120 AND DPD_VALUE <= 150 THEN 5
WHEN DPD_VALUE > 150 AND DPD_VALUE <= 180 THEN 6
WHEN DPD_VALUE > 180 AND DPD_VALUE <= 210 THEN 7
WHEN DPD_VALUE > 210 AND DPD_VALUE <= 240 THEN 8
WHEN DPD_VALUE > 240 AND DPD_VALUE <= 270 THEN 9
WHEN DPD_VALUE > 270 AND DPD_VALUE <= 300 THEN 10
WHEN DPD_VALUE > 300 AND DPD_VALUE <= 330 THEN 11
WHEN DPD_VALUE > 330 AND DPD_VALUE <= 360 THEN 12
WHEN DPD_VALUE > 360 THEN 13
END AS OPENING_BUCKET
from "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."CONSUMER_STATUS_RECORDRBI"
WHERE CREATED_DATE = (SELECT start_date FROM start_date) and _FIVETRAN_DELETED = 'FALSE'),
FEMI_NEW AS
(WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as Loan_ID
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN parse_json(TEXT_) end) as json
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN BYTEARRAY_ID_ end) as BYTEARRAY_ID_
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
GROUP BY 1)
SELECT A.LOAN_ID,(PRINCIPAL_AMOUNT+INTEREST_AMOUNT) AS FIRST_EMI_AMOUNT,DUEDATE AS FIRST_EMI_DATE
from "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_OAUTH_ROLE_SR"."M_LOAN_REPAYMENT_SCHEDULE" LRS
LEFT JOIN A
ON A.LMS_LOAN_ID = LRS.LOAN_ID
where _FIVETRAN_DELETED = 'FALSE'
AND INSTALLMENT = 1
),
FEMI_OLD AS
(WITH RPS AS
(WITH A AS
(SELECT TASK_ID_
,MAX(CASE WHEN NAME_='loanNumber' THEN TEXT_ end) as LMS_LOAN_ID
,MAX(CASE WHEN NAME_='loanId' THEN TEXT_ end) as Loan_ID
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN parse_json(TEXT_) end) as json
,MAX(CASE WHEN NAME_='repaymentScheduleOld' THEN BYTEARRAY_ID_ end) as BYTEARRAY_ID_
FROM "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_RU_VARIABLE"
GROUP BY 1)
SELECT TASK_ID_, LMS_LOAN_ID, Loan_ID,
CASE WHEN bytes_ IS NULL THEN json ELSE parse_json(try_HEX_DECODE_STRING(to_varchar(SUBSTR(bytes_,8)))) END AS json_final
from A
LEFT JOIN "PC_STITCH_DB"."LMS_NBFC_NEW_LMS_WORKFLOW"."ACT_GE_BYTEARRAY" B
ON A.BYTEARRAY_ID_ = B.ID_),
RPS_FINAL as
(SELECT d.Loan_ID, d.LMS_LOAN_ID,
f.value:dueDate::string as dueDate,
f.value:openingPrincipal::string as openingPrincipal,
f.value:installmentAmount::string as installmentAmount,
f.value:principal::string as principal,
f.value:intrest::string as intrest,
f.value:moratoriumFlag::string as moratoriumFlag,
f.value:moratoriumInterestDue::string as moratoriumInterestDue,
f.value:closingPrincipal::string as closingPrincipal,
CASE WHEN duedate IS NULL THEN NULL
WHEN TRY_TO_DATE(duedate, 'YYYY-MM-DD') IS NOT NULL THEN TRY_TO_DATE(duedate, 'YYYY-MM-DD')
WHEN LEN(duedate)<4 THEN NULL
ELSE TO_DATE(TO_CHAR(TO_DATE(duedate, 'DD MON YYYY'), 'YYYY-MM-DD')) END AS Final_Due_Date,
MONTH(Final_Due_Date) AS RPS_MONTH,
YEAR(Final_Due_Date) AS RPS_YEAR
FROM RPS d, table(flatten(input=>d.json_final)) f)
SELECT LOAN_ID,Final_Due_Date AS FIRST_EMI_DATE
FROM RPS_FINAL
WHERE LOAN_ID NOT LIKE ('%D%')
QUALIFY ROW_NUMBER() OVER (PARTITION BY Loan_ID ORDER BY Final_Due_Date) = 1
)
SELECT Z.*,
CASE WHEN Z.CURRENT_DPD <= 0 THEN 0
WHEN Z.CURRENT_DPD > 0 AND Z.CURRENT_DPD <= 30 THEN 1
WHEN Z.CURRENT_DPD > 30 AND Z.CURRENT_DPD <= 60 THEN 2
WHEN Z.CURRENT_DPD > 60 AND Z.CURRENT_DPD <= 90 THEN 3
WHEN Z.CURRENT_DPD > 90 AND Z.CURRENT_DPD <= 120 THEN 4
WHEN Z.CURRENT_DPD > 120 AND Z.CURRENT_DPD <= 150 THEN 5
WHEN Z.CURRENT_DPD > 150 AND Z.CURRENT_DPD <= 180 THEN 6
WHEN Z.CURRENT_DPD > 180 AND Z.CURRENT_DPD <= 210 THEN 7
WHEN Z.CURRENT_DPD > 210 AND Z.CURRENT_DPD <= 240 THEN 8
WHEN Z.CURRENT_DPD > 240 AND Z.CURRENT_DPD <= 270 THEN 9
WHEN Z.CURRENT_DPD > 270 AND Z.CURRENT_DPD <= 300 THEN 10
WHEN Z.CURRENT_DPD > 300 AND Z.CURRENT_DPD <= 330 THEN 11
WHEN Z.CURRENT_DPD > 330 AND Z.CURRENT_DPD <= 360 THEN 12
WHEN Z.CURRENT_DPD > 360 THEN 13
END AS CURRENT_BUCKET,
COALESCE(c.bounce_3m,0) AS bounce_3m,
COALESCE(c.bounce_6m,0) AS bounce_6m,
COALESCE(c.bounce_12m,0) AS bounce_12m,
CASE WHEN COALESCE(c.bounce_12m,0)=0 THEN 0 ELSE round(d.total_dpd/c.bounce_12m) END AS AVG_DPD_12M,
GREATEST(O.OPENING_BUCKET, e.PLUS30_EVER) AS max_dpd_bucket,
--b.BOUNCE_FLAG,
CASE WHEN NACH.PRIMARY_NACH_STATUS IS NULL THEN 'NO'
WHEN NACH.PRIMARY_NACH_STATUS IN ('ACTIVE','Active') then 'YES' else 'NO' end AS nache_registered,
CASE WHEN GPS.STATUS_FOR_SEGMENTATION IN ('Inactive') then 'Inactive' else 'Active' end AS FINAL_STATUS,
CASE WHEN FEMI_OLD.FIRST_EMI_DATE IS NULL THEN FEMI_NEW.FIRST_EMI_DATE ELSE FEMI_OLD.FIRST_EMI_DATE END AS FIRST_EMI,
DATEDIFF(month,FIRST_EMI,(select end_date from end_date)) AS MOB
FROM BASE_DATA Z
--LEFT JOIN CURRENT_MONTH_BOUNCE b ON Z.LOAN_ID = b.LOAN_ID
LEFT JOIN h_bounce_data c ON Z.LOAN_ID = c.LOAN_ID
LEFT JOIN avg_dpd d ON Z.LOAN_ID = d.LOAN_ID
LEFT JOIN dpd_bkt e ON Z.LOAN_ID = e.LOAN_ID
LEFT JOIN NACH ON Z.LOAN_ID = NACH.LOAN_ID
LEFT JOIN GPS ON Z.LOAN_ID = GPS.LOAN_ID
LEFT JOIN OPENING_DPD O ON Z.LOAN_ID = O.LOAN_NUMBER_CAR24
LEFT JOIN FEMI_NEW ON Z.LOAN_ID = FEMI_NEW.LOAN_ID
LEFT JOIN FEMI_OLD ON Z.LOAN_ID = FEMI_OLD.LOAN_ID
WHERE Z.LOAN_STATUS = 'Active'
"""


class Segment:
    """Apply the soft-calling segmentation rules to a single loan.

    The constructor receives a list of record dicts and classifies the FIRST
    one. The record must provide the keys ``mob``, ``bounce_3M``,
    ``bounce_6M``, ``bounce_12M``, ``avg_dpd_12M``, ``max_dpd_3M``,
    ``nache_registered`` (compared against 0) and ``gps_status`` (compared
    against ``'Active'``).

    ``get_segment_final`` is the entry point; it returns one of the codes
    ``'S1'`` .. ``'S7'`` or ``None`` when no rule matches.
    """

    def __init__(self, _case):
        # Only the first record of the list is used.
        self.case = _case[0]
        self.calculate_mob()
        self.calculate_bounce()
        self.calculate_payment()
        self.calculate_special()

    def calculate_mob(self):
        """Copy the ``mob`` value from the record onto the instance."""
        self.mob = self.case['mob']

    def calculate_bounce(self):
        """Copy the 3/6/12-month bounce counts from the record."""
        self.bounce_3M = self.case['bounce_3M']
        self.bounce_6M = self.case['bounce_6M']
        self.bounce_12M = self.case['bounce_12M']

    def calculate_payment(self):
        """Copy the DPD figures (``avg_dpd_12M``, ``max_dpd_3M``)."""
        self.avg_dpd_12M = self.case['avg_dpd_12M']
        self.max_dpd_3M = self.case['max_dpd_3M']

    def calculate_special(self):
        """Copy the NACH-registration flag and GPS status."""
        self.nache_registered = self.case['nache_registered']
        self.gps_status = self.case['gps_status']

    def segment_123_mob(self):
        """Rules for ``mob`` 1-3; returns ``None`` for any other ``mob``."""
        if self.mob == 1:
            return 'S2'
        if self.mob in [2, 3]:
            if self.nache_registered == 0:
                return 'S6'
            if self.bounce_3M == 0:
                return 'S2'
            return 'S6'
        return None

    def segment_4_12_mob(self):
        """Rules driven by the 6-month bounce count (used for ``mob`` 4-12).

        Rule order matters: the first matching condition wins.
        """
        if self.bounce_6M >= 4:
            return 'S7'
        if (self.bounce_6M > 1) and (self.bounce_6M < 4):
            return 'S5'
        if (self.bounce_6M == 1) and (self.bounce_3M == 1):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_6M == 1) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_6M == 0:
            return 'S1'
        return None

    def segment_12_plus_mob(self):
        """Rules driven by the 12-month bounce count (used for ``mob`` > 12).

        Rule order matters: the first matching condition wins.
        """
        if self.bounce_12M >= 6:
            return 'S7'
        if self.bounce_12M in [3, 4, 5]:
            return 'S5'
        if (self.bounce_12M in [1, 2]) and (self.bounce_3M in [1, 2]):
            return 'S4'
        if (self.nache_registered == 0) and (self.avg_dpd_12M < 15):
            return 'S3'
        if (self.bounce_12M in [1, 2]) and (self.bounce_3M == 0):
            return 'S3'
        if self.bounce_12M == 0:
            return 'S1'
        return None

    def segment_special(self):
        """Return ``'S7'`` for high-average-DPD loans with a risk marker.

        Applies when ``avg_dpd_12M >= 15`` and at least one of: recent DPD
        bucket >= 1, GPS not ``'Active'``, NACH not registered. Otherwise
        returns ``None``.
        """
        if self.avg_dpd_12M >= 15:
            if (
                (self.max_dpd_3M >= 1)
                or (self.gps_status != 'Active')
                or (self.nache_registered == 0)
            ):
                return 'S7'
        return None

    def get_segment(self):
        """Pick the rule set for the loan's ``mob`` and return its segment.

        For ``mob`` >= 4 the special rule is tried first and the bounce-based
        rules only run when it yields ``None``. Returns ``None`` when ``mob``
        falls in neither range (e.g. NaN).
        """
        if self.mob <= 3:
            return self.segment_123_mob()
        if self.mob >= 4:
            _segment = self.segment_special()
            if _segment is None:
                if (self.mob >= 4) and (self.mob <= 12):
                    _segment = self.segment_4_12_mob()
                elif self.mob > 12:
                    _segment = self.segment_12_plus_mob()
                else:
                    _segment = None
            return _segment
        return None

    def get_segment_final(self):
        """Return the final segment after the ``avg_dpd_12M`` overrides.

        ``mob`` 0-3 is always ``'S2'``; otherwise S6/S7 are softened when
        ``avg_dpd_12M <= 15`` and S4-S7 are escalated to ``'S7'`` when it is
        above 15.
        """
        _segment = self.get_segment()
        if self.mob in [0, 1, 2, 3]:
            return 'S2'
        if (_segment == 'S6') and (self.avg_dpd_12M <= 15):
            return 'S2'
        if (_segment == 'S7') and (self.avg_dpd_12M <= 15):
            return 'S6'
        if (_segment in ['S4', 'S5', 'S6', 'S7']) and (self.avg_dpd_12M > 15):
            return 'S7'
        return _segment


def fetch_data(session):
    """Run the segmentation query and return its rows as a pandas DataFrame.

    Args:
        session: Snowpark session used to execute the SQL.

    Returns:
        The query result with rows whose ``LOAN_ID`` is null removed.
    """
    #### Segmentation data query
    segmentation_data = session.sql(_SEGMENTATION_QUERY)

    # read sql query
    logging.info("Fetching data from segment query...")
    segmentation_data = segmentation_data.to_pandas()

    # remove null values
    logging.info(f"Running Segment Data query for {segmentation_data.columns}")
    segmentation_data = segmentation_data[segmentation_data['LOAN_ID'].notnull()]
    logging.info(f"Segmentation data is {segmentation_data}")
    return segmentation_data


def df_processor(session, df):
    """Select and rename the columns ``Segment`` needs and encode NACH.

    Args:
        session: Unused; kept for interface compatibility.
        df: Frame with the upper-case columns produced by ``fetch_data``.

    Returns:
        A new DataFrame (the input is not modified) with the renamed columns
        and ``nache_registered`` mapped from ``'YES'``/``'NO'`` to 1/0.
    """
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the caller's (avoids SettingWithCopyWarning).
    df = df[['LOAN_ID', 'MOB', 'CURRENT_BUCKET', 'BOUNCE_3M', 'BOUNCE_6M', 'BOUNCE_12M',
             'AVG_DPD_12M', 'MAX_DPD_BUCKET', 'NACHE_REGISTERED', 'FINAL_STATUS']].copy()
    df.columns = ['LOAN_ID',
                  'mob',
                  'current_bucket',
                  'bounce_3M',
                  'bounce_6M',
                  'bounce_12M',
                  'avg_dpd_12M',
                  'max_dpd_3M',
                  'nache_registered',
                  'gps_status',
                  ]
    logging.info(f"Columnsss are :{df.columns}")

    ## Corrections
    # gps_status needs no remapping here: the query's FINAL_STATUS column is
    # already either 'Active' or 'Inactive'.
    nach_mapper = {'NO': 0, 'YES': 1}
    df['nache_registered'] = df['nache_registered'].map(nach_mapper)

    logging.info(f"Processing df {df}")
    return df


def segment_dict_extractor(session, df):
    """Classify every loan in ``df`` and add a ``"Segment Names"`` column.

    Each distinct ``LOAN_ID`` is classified once with ``Segment``, using the
    first row carrying that id; rows sharing an id get the same segment.

    Args:
        session: Unused; kept for interface compatibility.
        df: Frame produced by ``df_processor``. It is modified in place.

    Returns:
        The same ``df`` with the ``"Segment Names"`` column set.
    """
    segment_dict = {}
    # Single pass over the records instead of re-filtering the whole frame for
    # every row (which was quadratic in the number of loans).
    for record in df.to_dict(orient='records'):
        loan_id = record["LOAN_ID"]
        if loan_id not in segment_dict:
            segment_dict[loan_id] = Segment([record]).get_segment_final()

    df["Segment Names"] = df["LOAN_ID"].map(segment_dict)
    logging.info(f"Segment dict is {segment_dict} and df is {df.columns}")
    return df


def main(session: snowpark.Session):
    """Snowpark entry point: fetch, prepare, segment and return the loans.

    Args:
        session: Active Snowpark session.

    Returns:
        A Snowpark DataFrame built from the segmented pandas DataFrame.
    """
    df = fetch_data(session)
    df = df_processor(session, df)
    df = segment_dict_extractor(session, df)
    snowpark_df = session.create_dataframe(df)
    return snowpark_df